RE: [PATCH][AArch64] Implement ACLE Data Intrinsics

2022-10-06 Thread Richard Biener via Gcc-patches
On Wed, 5 Oct 2022, Kyrylo Tkachov wrote:

> 
> 
> > -Original Message-
> > From: Andre Vieira (lists) 
> > Sent: Tuesday, October 4, 2022 11:34 AM
> > To: Kyrylo Tkachov ; gcc-patches@gcc.gnu.org;
> > Richard Sandiford ; Richard Biener
> > 
> > Subject: Re: [PATCH][AArch64] Implement ACLE Data Intrinsics
> > 
> > Hi all,
> > 
> > Can I backport this to gcc-11 branch? Also applies cleanly (with the
> > exception of the file extensions being different: 'aarch64-builtins.cc
> > vs aarch64-builtins.c').
> > 
> 
> Ok by me if testing is clean.

Target patches like this are really up to the maintainers to decide
for backporting.

Richard.


RE: [PATCH][AArch64] Implement ACLE Data Intrinsics

2022-10-05 Thread Kyrylo Tkachov via Gcc-patches


> -Original Message-
> From: Andre Vieira (lists) 
> Sent: Tuesday, October 4, 2022 11:34 AM
> To: Kyrylo Tkachov ; gcc-patches@gcc.gnu.org;
> Richard Sandiford ; Richard Biener
> 
> Subject: Re: [PATCH][AArch64] Implement ACLE Data Intrinsics
> 
> Hi all,
> 
> Can I backport this to gcc-11 branch? Also applies cleanly (with the
> exception of the file extensions being different: 'aarch64-builtins.cc
> vs aarch64-builtins.c').
> 

Ok by me if testing is clean.
Thanks,
Kyrill

> Bootstrapped and regression tested on aarch64-linux-gnu.
> 
> Kind regards,
> Andre Vieira


Re: [PATCH][AArch64] Implement ACLE Data Intrinsics

2022-10-04 Thread Andre Vieira (lists) via Gcc-patches

Hi all,

Can I backport this to gcc-11 branch? Also applies cleanly (with the 
exception of the file extensions being different: 'aarch64-builtins.cc 
vs aarch64-builtins.c').


Bootstrapped and regression tested on aarch64-linux-gnu.

Kind regards,
Andre Vieira


RE: [PATCH][AArch64] Implement ACLE Data Intrinsics

2022-08-11 Thread Kyrylo Tkachov via Gcc-patches


> -Original Message-
> From: Andre Vieira (lists) 
> Sent: Thursday, August 11, 2022 4:11 PM
> To: gcc-patches@gcc.gnu.org; Kyrylo Tkachov ;
> Richard Sandiford ; Richard Biener
> 
> Subject: Re: [PATCH][AArch64] Implement ACLE Data Intrinsics
> 
> OK to backport this to gcc-12? Applies cleanly and did a bootstrat and
> regression test on aarch64-linux-gnu

Ok as long as it's before the branch freeze.
Thanks,
Kyrill

> 
> Regards,
> Andre
> 
> On 01/07/2022 12:26, Richard Sandiford wrote:
> > "Andre Vieira (lists)"  writes:
> >> On 29/06/2022 08:18, Richard Sandiford wrote:
> >>>> +  break;
> >>>> +case AARCH64_RBIT:
> >>>> +case AARCH64_RBITL:
> >>>> +case AARCH64_RBITLL:
> >>>> +  if (mode == SImode)
> >>>> +icode = CODE_FOR_aarch64_rbitsi;
> >>>> +  else
> >>>> +icode = CODE_FOR_aarch64_rbitdi;
> >>>> +  break;
> >>>> +default:
> >>>> +  gcc_unreachable ();
> >>>> +}
> >>>> +  expand_insn (icode, 2, ops);
> >>>> +  return target;
> >>> This needs to return ops[0].value instead, since "target" just suggests
> >>> a possible location.
> >>>
> >>> Could you add tests for a memory source and memory destination, e.g.:
> >>>
> >>> void test_clz_mem (uint32_t *a)
> >>> {
> >>> *a = __clz (*a);
> >>> }
> >>>
> >>> Without tests like that, these comments probably just sound like a paper
> >>> exercise, but they should make a difference for memory sources
> (previous
> >>> review) and memory destinations (this round).
> >> I had locally tested it (with rev though because clz doesn't use that
> >> code) and strangely it does seem to work for the memory destinations,
> >> but that's just a simple test.
> >> It could very well go wrong with some more complex codegen, so I'll just
> >> take your word and use ops[0].value.
> >>
> >> And yeah I didn't add the tests at the time, don't really know why, I'll
> >> chuck it down to laziness :P
> >>>> diff --git a/gcc/config/aarch64/arm_acle.h
> b/gcc/config/aarch64/arm_acle.h
> >>>> index
> 9775a48c65825b424d3eb442384f5ab87b734fd7..a044bc74553fcf2a49b71290
> 083f3f072fd5a2ce 100644
> >>>> --- a/gcc/config/aarch64/arm_acle.h
> >>>> +++ b/gcc/config/aarch64/arm_acle.h
> >>>> @@ -28,6 +28,7 @@
> >>>>#define _GCC_ARM_ACLE_H
> >>>>
> >>>>#include 
> >>>> +#include 
> >>>>
> >>>>#pragma GCC aarch64 "arm_acle.h"
> >>>>
> >>>> @@ -35,6 +36,58 @@
> >>>>extern "C" {
> >>>>#endif
> >>>>
> >>>> +#define _GCC_ARM_ACLE_ROR_FN(NAME, TYPE)
> \
> >>>> +__extension__ extern __inline TYPE
> \
> >>>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> \
> >>>> +NAME (TYPE __value, uint32_t __rotate)
> \
> >>>> +{   
> >>>>   \
> >>>> +  size_t __size = sizeof (TYPE) * __CHAR_BIT__;
> \
> >>>> +  __rotate = __rotate % __size;
> \
> >>>> +  return __value >> __rotate | __value << ((__size - __rotate) % 
> >>>> __size);
> \
> >>>> +}
> >>>> +
> >>>> +_GCC_ARM_ACLE_ROR_FN (__ror, uint32_t)
> >>>> +_GCC_ARM_ACLE_ROR_FN (__rorl, unsigned long)
> >>>> +_GCC_ARM_ACLE_ROR_FN (__rorll, uint64_t)
> >>>> +
> >>>> +#undef _GCC_ARM_ACLE_ROR_FN
> >>>> +
> >>>> +#define _GCC_ARM_ACLE_DATA_FN(NAME, BUILTIN, ITYPE, RTYPE)
>   \
> >>>> +__extension__ extern __inline RTYPE \
> >>>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
> >>>> +__##NAME (ITYPE __value)\
> >>>> +{   \
> >>>> +  return __builtin_##BUILTIN (__value); \
> >>>> +}
> >>>> +
> >>>> +_GCC_ARM_ACLE_DATA_FN (cl

Re: [PATCH][AArch64] Implement ACLE Data Intrinsics

2022-08-11 Thread Andre Vieira (lists) via Gcc-patches
OK to backport this to gcc-12? Applies cleanly and did a bootstrat and 
regression test on aarch64-linux-gnu


Regards,
Andre

On 01/07/2022 12:26, Richard Sandiford wrote:

"Andre Vieira (lists)"  writes:

On 29/06/2022 08:18, Richard Sandiford wrote:

+  break;
+case AARCH64_RBIT:
+case AARCH64_RBITL:
+case AARCH64_RBITLL:
+  if (mode == SImode)
+   icode = CODE_FOR_aarch64_rbitsi;
+  else
+   icode = CODE_FOR_aarch64_rbitdi;
+  break;
+default:
+  gcc_unreachable ();
+}
+  expand_insn (icode, 2, ops);
+  return target;

This needs to return ops[0].value instead, since "target" just suggests
a possible location.

Could you add tests for a memory source and memory destination, e.g.:

void test_clz_mem (uint32_t *a)
{
*a = __clz (*a);
}

Without tests like that, these comments probably just sound like a paper
exercise, but they should make a difference for memory sources (previous
review) and memory destinations (this round).

I had locally tested it (with rev though because clz doesn't use that
code) and strangely it does seem to work for the memory destinations,
but that's just a simple test.
It could very well go wrong with some more complex codegen, so I'll just
take your word and use ops[0].value.

And yeah I didn't add the tests at the time, don't really know why, I'll
chuck it down to laziness :P

diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
index 
9775a48c65825b424d3eb442384f5ab87b734fd7..a044bc74553fcf2a49b71290083f3f072fd5a2ce
 100644
--- a/gcc/config/aarch64/arm_acle.h
+++ b/gcc/config/aarch64/arm_acle.h
@@ -28,6 +28,7 @@
   #define _GCC_ARM_ACLE_H
   
   #include 

+#include 
   
   #pragma GCC aarch64 "arm_acle.h"
   
@@ -35,6 +36,58 @@

   extern "C" {
   #endif
   
+#define _GCC_ARM_ACLE_ROR_FN(NAME, TYPE)  \

+__extension__ extern __inline TYPE   \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))  \
+NAME (TYPE __value, uint32_t __rotate)   \
+{\
+  size_t __size = sizeof (TYPE) * __CHAR_BIT__;
  \
+  __rotate = __rotate % __size;
  \
+  return __value >> __rotate | __value << ((__size - __rotate) % __size); \
+}
+
+_GCC_ARM_ACLE_ROR_FN (__ror, uint32_t)
+_GCC_ARM_ACLE_ROR_FN (__rorl, unsigned long)
+_GCC_ARM_ACLE_ROR_FN (__rorll, uint64_t)
+
+#undef _GCC_ARM_ACLE_ROR_FN
+
+#define _GCC_ARM_ACLE_DATA_FN(NAME, BUILTIN, ITYPE, RTYPE) \
+__extension__ extern __inline RTYPE\
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+__##NAME (ITYPE __value)   \
+{  \
+  return __builtin_##BUILTIN (__value);\
+}
+
+_GCC_ARM_ACLE_DATA_FN (clz, clz, uint32_t, unsigned int)
+_GCC_ARM_ACLE_DATA_FN (clzl, clzl, unsigned long, unsigned int)
+_GCC_ARM_ACLE_DATA_FN (clzll, clzll, uint64_t, unsigned int)
+_GCC_ARM_ACLE_DATA_FN (cls, clrsb, uint32_t, unsigned int)
+_GCC_ARM_ACLE_DATA_FN (clsl, clrsbl, unsigned long, unsigned int)
+_GCC_ARM_ACLE_DATA_FN (clsll, clrsbll, uint64_t, unsigned int)
+_GCC_ARM_ACLE_DATA_FN (rev16, aarch64_rev16, uint32_t, uint32_t)
+_GCC_ARM_ACLE_DATA_FN (rev16l, aarch64_rev16l, unsigned long, unsigned long)
+_GCC_ARM_ACLE_DATA_FN (rev16ll, aarch64_rev16ll, uint64_t, uint64_t)
+_GCC_ARM_ACLE_DATA_FN (rbit, aarch64_rbit, uint32_t, uint32_t)
+_GCC_ARM_ACLE_DATA_FN (rbitl, aarch64_rbitl, unsigned long, unsigned long)
+_GCC_ARM_ACLE_DATA_FN (rbitll, aarch64_rbitll, uint64_t, uint64_t)
+_GCC_ARM_ACLE_DATA_FN (revsh, bswap16, int16_t, uint16_t)

The return type should be int16_t.

Nice catch!

The clz and cls tests have the old return types (same as the argument
types), but I guess that's a good thing, since it shows that we avoid
the redundant zero-extend in clzll and clsll.

Yeah I noticed that too when I was adding the mem tests, but I did
change them though because at the time it just felt like an oversight,
though I too was pleasantly surprised GCC was managing to avoid the
zero-extending :)
I then saw your comment and made me wonder whether I should keep the
wrong return types in... I haven't but happy to change them back if you
think it's a nice 'test' to have.

I thought it was OK/useful as it was, but I don't mind either way.

BTW, while trying it out locally, I noticed:

   aarch64_init_data_intrinsics

was called from the wrong place.  Since it's adding normal __builtin
functions, it should be called from aarch64_general_init_builtins
instead of handle_arm_acle_h.

handle_arm_acle_h is instead for cases where we want to simulate
C/C++ definitions of the ACLE intrinsics themselves (i.e. so that
the intrinsics themselves are built-in functions, rather than
inline 

Re: [PATCH][AArch64] Implement ACLE Data Intrinsics

2022-07-01 Thread Richard Sandiford via Gcc-patches
"Andre Vieira (lists)"  writes:
> On 29/06/2022 08:18, Richard Sandiford wrote:
>>> +  break;
>>> +case AARCH64_RBIT:
>>> +case AARCH64_RBITL:
>>> +case AARCH64_RBITLL:
>>> +  if (mode == SImode)
>>> +   icode = CODE_FOR_aarch64_rbitsi;
>>> +  else
>>> +   icode = CODE_FOR_aarch64_rbitdi;
>>> +  break;
>>> +default:
>>> +  gcc_unreachable ();
>>> +}
>>> +  expand_insn (icode, 2, ops);
>>> +  return target;
>> This needs to return ops[0].value instead, since "target" just suggests
>> a possible location.
>>
>> Could you add tests for a memory source and memory destination, e.g.:
>>
>> void test_clz_mem (uint32_t *a)
>> {
>>*a = __clz (*a);
>> }
>>
>> Without tests like that, these comments probably just sound like a paper
>> exercise, but they should make a difference for memory sources (previous
>> review) and memory destinations (this round).
> I had locally tested it (with rev though because clz doesn't use that 
> code) and strangely it does seem to work for the memory destinations, 
> but that's just a simple test.
> It could very well go wrong with some more complex codegen, so I'll just 
> take your word and use ops[0].value.
>
> And yeah I didn't add the tests at the time, don't really know why, I'll 
> chuck it down to laziness :P
>>
>>> diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
>>> index 
>>> 9775a48c65825b424d3eb442384f5ab87b734fd7..a044bc74553fcf2a49b71290083f3f072fd5a2ce
>>>  100644
>>> --- a/gcc/config/aarch64/arm_acle.h
>>> +++ b/gcc/config/aarch64/arm_acle.h
>>> @@ -28,6 +28,7 @@
>>>   #define _GCC_ARM_ACLE_H
>>>   
>>>   #include 
>>> +#include 
>>>   
>>>   #pragma GCC aarch64 "arm_acle.h"
>>>   
>>> @@ -35,6 +36,58 @@
>>>   extern "C" {
>>>   #endif
>>>   
>>> +#define _GCC_ARM_ACLE_ROR_FN(NAME, TYPE) \
>>> +__extension__ extern __inline TYPE   \
>>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>>>   \
>>> +NAME (TYPE __value, uint32_t __rotate) 
>>>   \
>>> +{\
>>> +  size_t __size = sizeof (TYPE) * __CHAR_BIT__;
>>>   \
>>> +  __rotate = __rotate % __size;
>>>   \
>>> +  return __value >> __rotate | __value << ((__size - __rotate) % __size); \
>>> +}
>>> +
>>> +_GCC_ARM_ACLE_ROR_FN (__ror, uint32_t)
>>> +_GCC_ARM_ACLE_ROR_FN (__rorl, unsigned long)
>>> +_GCC_ARM_ACLE_ROR_FN (__rorll, uint64_t)
>>> +
>>> +#undef _GCC_ARM_ACLE_ROR_FN
>>> +
>>> +#define _GCC_ARM_ACLE_DATA_FN(NAME, BUILTIN, ITYPE, RTYPE) \
>>> +__extension__ extern __inline RTYPE\
>>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
>>> +__##NAME (ITYPE __value)   \
>>> +{  \
>>> +  return __builtin_##BUILTIN (__value);\
>>> +}
>>> +
>>> +_GCC_ARM_ACLE_DATA_FN (clz, clz, uint32_t, unsigned int)
>>> +_GCC_ARM_ACLE_DATA_FN (clzl, clzl, unsigned long, unsigned int)
>>> +_GCC_ARM_ACLE_DATA_FN (clzll, clzll, uint64_t, unsigned int)
>>> +_GCC_ARM_ACLE_DATA_FN (cls, clrsb, uint32_t, unsigned int)
>>> +_GCC_ARM_ACLE_DATA_FN (clsl, clrsbl, unsigned long, unsigned int)
>>> +_GCC_ARM_ACLE_DATA_FN (clsll, clrsbll, uint64_t, unsigned int)
>>> +_GCC_ARM_ACLE_DATA_FN (rev16, aarch64_rev16, uint32_t, uint32_t)
>>> +_GCC_ARM_ACLE_DATA_FN (rev16l, aarch64_rev16l, unsigned long, unsigned 
>>> long)
>>> +_GCC_ARM_ACLE_DATA_FN (rev16ll, aarch64_rev16ll, uint64_t, uint64_t)
>>> +_GCC_ARM_ACLE_DATA_FN (rbit, aarch64_rbit, uint32_t, uint32_t)
>>> +_GCC_ARM_ACLE_DATA_FN (rbitl, aarch64_rbitl, unsigned long, unsigned long)
>>> +_GCC_ARM_ACLE_DATA_FN (rbitll, aarch64_rbitll, uint64_t, uint64_t)
>>> +_GCC_ARM_ACLE_DATA_FN (revsh, bswap16, int16_t, uint16_t)
>> The return type should be int16_t.
> Nice catch!
>> The clz and cls tests have the old return types (same as the argument
>> types), but I guess that's a good thing, since it shows that we avoid
>> the redundant zero-extend in clzll and clsll.
> Yeah I noticed that too when I was adding the mem tests, but I did 
> change them though because at the time it just felt like an oversight, 
> though I too was pleasantly surprised GCC was managing to avoid the 
> zero-extending :)
> I then saw your comment and made me wonder whether I should keep the 
> wrong return types in... I haven't but happy to change them back if you 
> think it's a nice 'test' to have.

I thought it was OK/useful as it was, but I don't mind either way.

BTW, while trying it out locally, I noticed:

  aarch64_init_data_intrinsics

was called from the wrong place.  Since it's adding normal __builtin
functions, it should be called from aarch64_general_init_builtins
instead of handle_arm_acle_h.


Re: [PATCH][AArch64] Implement ACLE Data Intrinsics

2022-07-01 Thread Andre Vieira (lists) via Gcc-patches


On 29/06/2022 08:18, Richard Sandiford wrote:

+  break;
+case AARCH64_RBIT:
+case AARCH64_RBITL:
+case AARCH64_RBITLL:
+  if (mode == SImode)
+   icode = CODE_FOR_aarch64_rbitsi;
+  else
+   icode = CODE_FOR_aarch64_rbitdi;
+  break;
+default:
+  gcc_unreachable ();
+}
+  expand_insn (icode, 2, ops);
+  return target;

This needs to return ops[0].value instead, since "target" just suggests
a possible location.

Could you add tests for a memory source and memory destination, e.g.:

void test_clz_mem (uint32_t *a)
{
   *a = __clz (*a);
}

Without tests like that, these comments probably just sound like a paper
exercise, but they should make a difference for memory sources (previous
review) and memory destinations (this round).
I had locally tested it (with rev though because clz doesn't use that 
code) and strangely it does seem to work for the memory destinations, 
but that's just a simple test.
It could very well go wrong with some more complex codegen, so I'll just 
take your word and use ops[0].value.


And yeah I didn't add the tests at the time, don't really know why, I'll 
chuck it down to laziness :P



diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
index 
9775a48c65825b424d3eb442384f5ab87b734fd7..a044bc74553fcf2a49b71290083f3f072fd5a2ce
 100644
--- a/gcc/config/aarch64/arm_acle.h
+++ b/gcc/config/aarch64/arm_acle.h
@@ -28,6 +28,7 @@
  #define _GCC_ARM_ACLE_H
  
  #include 

+#include 
  
  #pragma GCC aarch64 "arm_acle.h"
  
@@ -35,6 +36,58 @@

  extern "C" {
  #endif
  
+#define _GCC_ARM_ACLE_ROR_FN(NAME, TYPE)  \

+__extension__ extern __inline TYPE   \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))  \
+NAME (TYPE __value, uint32_t __rotate)   \
+{\
+  size_t __size = sizeof (TYPE) * __CHAR_BIT__;
  \
+  __rotate = __rotate % __size;
  \
+  return __value >> __rotate | __value << ((__size - __rotate) % __size); \
+}
+
+_GCC_ARM_ACLE_ROR_FN (__ror, uint32_t)
+_GCC_ARM_ACLE_ROR_FN (__rorl, unsigned long)
+_GCC_ARM_ACLE_ROR_FN (__rorll, uint64_t)
+
+#undef _GCC_ARM_ACLE_ROR_FN
+
+#define _GCC_ARM_ACLE_DATA_FN(NAME, BUILTIN, ITYPE, RTYPE) \
+__extension__ extern __inline RTYPE\
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+__##NAME (ITYPE __value)   \
+{  \
+  return __builtin_##BUILTIN (__value);\
+}
+
+_GCC_ARM_ACLE_DATA_FN (clz, clz, uint32_t, unsigned int)
+_GCC_ARM_ACLE_DATA_FN (clzl, clzl, unsigned long, unsigned int)
+_GCC_ARM_ACLE_DATA_FN (clzll, clzll, uint64_t, unsigned int)
+_GCC_ARM_ACLE_DATA_FN (cls, clrsb, uint32_t, unsigned int)
+_GCC_ARM_ACLE_DATA_FN (clsl, clrsbl, unsigned long, unsigned int)
+_GCC_ARM_ACLE_DATA_FN (clsll, clrsbll, uint64_t, unsigned int)
+_GCC_ARM_ACLE_DATA_FN (rev16, aarch64_rev16, uint32_t, uint32_t)
+_GCC_ARM_ACLE_DATA_FN (rev16l, aarch64_rev16l, unsigned long, unsigned long)
+_GCC_ARM_ACLE_DATA_FN (rev16ll, aarch64_rev16ll, uint64_t, uint64_t)
+_GCC_ARM_ACLE_DATA_FN (rbit, aarch64_rbit, uint32_t, uint32_t)
+_GCC_ARM_ACLE_DATA_FN (rbitl, aarch64_rbitl, unsigned long, unsigned long)
+_GCC_ARM_ACLE_DATA_FN (rbitll, aarch64_rbitll, uint64_t, uint64_t)
+_GCC_ARM_ACLE_DATA_FN (revsh, bswap16, int16_t, uint16_t)

The return type should be int16_t.

Nice catch!

The clz and cls tests have the old return types (same as the argument
types), but I guess that's a good thing, since it shows that we avoid
the redundant zero-extend in clzll and clsll.
Yeah I noticed that too when I was adding the mem tests, but I did 
change them though because at the time it just felt like an oversight, 
though I too was pleasantly surprised GCC was managing to avoid the 
zero-extending :)
I then saw your comment and made me wonder whether I should keep the 
wrong return types in... I haven't but happy to change them back if you 
think it's a nice 'test' to have.


Regards,
Andre
diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
b/gcc/config/aarch64/aarch64-builtins.cc
index 
e0a741ac663188713e21f457affa57217d074783..bb5d97c8fc6402635270df851a949cabeecaa5e8
 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -613,6 +613,12 @@ enum aarch64_builtins
   AARCH64_LS64_BUILTIN_ST64B,
   AARCH64_LS64_BUILTIN_ST64BV,
   AARCH64_LS64_BUILTIN_ST64BV0,
+  AARCH64_REV16,
+  AARCH64_REV16L,
+  AARCH64_REV16LL,
+  AARCH64_RBIT,
+  AARCH64_RBITL,
+  AARCH64_RBITLL,
   AARCH64_BUILTIN_MAX
 };
 
@@ -1664,10 +1670,41 @@ aarch64_init_ls64_builtins (void)
   = aarch64_general_add_builtin (data[i].name, data[i].type, data[i].code);
 

Re: [PATCH][AArch64] Implement ACLE Data Intrinsics

2022-06-29 Thread Richard Sandiford via Gcc-patches
"Andre Vieira (lists)"  writes:
> On 17/06/2022 11:54, Richard Sandiford wrote:
>> "Andre Vieira (lists)"  writes:
>>> Hi,
>>>
>>> This patch adds support for the ACLE Data Intrinsics to the AArch64 port.
>>>
>>> Bootstrapped and regression tested on aarch64-none-linux.
>>>
>>> OK for trunk?
>> Sorry for the slow review.
> No worries :)
>>
>>> +{\
>>> +  size_t size = sizeof (TYPE) * __CHAR_BIT__;\
>>> +  rotate = rotate % size;\
>>> +  return value >> rotate | value << (size - rotate); \
>> This runs into UB for rotate == 0.
> I assume it's because of the value << size no?

Yeah.

> I added a modulo, I assume it's legal to shift by 0?

Thanks, and yeah, shifting by zero is fine.

> This OK?
>
> diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
> b/gcc/config/aarch64/aarch64-builtins.cc
> index 
> e0a741ac663188713e21f457affa57217d074783..69f1fb3604a481fa378d105cf3ee98edec1ba619
>  100644
> --- a/gcc/config/aarch64/aarch64-builtins.cc
> +++ b/gcc/config/aarch64/aarch64-builtins.cc
> @@ -613,6 +613,12 @@ enum aarch64_builtins
>AARCH64_LS64_BUILTIN_ST64B,
>AARCH64_LS64_BUILTIN_ST64BV,
>AARCH64_LS64_BUILTIN_ST64BV0,
> +  AARCH64_REV16,
> +  AARCH64_REV16L,
> +  AARCH64_REV16LL,
> +  AARCH64_RBIT,
> +  AARCH64_RBITL,
> +  AARCH64_RBITLL,
>AARCH64_BUILTIN_MAX
>  };
>  
> @@ -1664,10 +1670,41 @@ aarch64_init_ls64_builtins (void)
>= aarch64_general_add_builtin (data[i].name, data[i].type, 
> data[i].code);
>  }
>  
> +static void
> +aarch64_init_data_intrinsics (void)
> +{
> +  tree uint32_fntype = build_function_type_list (uint32_type_node,
> +  uint32_type_node, NULL_TREE);
> +  tree ulong_fntype = build_function_type_list (long_unsigned_type_node,
> + long_unsigned_type_node,
> + NULL_TREE);
> +  tree uint64_fntype = build_function_type_list (uint64_type_node,
> +  uint64_type_node, NULL_TREE);
> +  aarch64_builtin_decls[AARCH64_REV16]
> += aarch64_general_add_builtin ("__builtin_aarch64_rev16", uint32_fntype,
> +AARCH64_REV16);
> +  aarch64_builtin_decls[AARCH64_REV16L]
> += aarch64_general_add_builtin ("__builtin_aarch64_rev16l", ulong_fntype,
> +AARCH64_REV16L);
> +  aarch64_builtin_decls[AARCH64_REV16LL]
> += aarch64_general_add_builtin ("__builtin_aarch64_rev16ll", 
> uint64_fntype,
> +AARCH64_REV16LL);
> +  aarch64_builtin_decls[AARCH64_RBIT]
> += aarch64_general_add_builtin ("__builtin_aarch64_rbit", uint32_fntype,
> +AARCH64_RBIT);
> +  aarch64_builtin_decls[AARCH64_RBITL]
> += aarch64_general_add_builtin ("__builtin_aarch64_rbitl", ulong_fntype,
> +AARCH64_RBITL);
> +  aarch64_builtin_decls[AARCH64_RBITLL]
> += aarch64_general_add_builtin ("__builtin_aarch64_rbitll", uint64_fntype,
> +AARCH64_RBITLL);
> +}
> +
>  /* Implement #pragma GCC aarch64 "arm_acle.h".  */
>  void
>  handle_arm_acle_h (void)
>  {
> +  aarch64_init_data_intrinsics ();
>if (TARGET_LS64)
>  aarch64_init_ls64_builtins ();
>  }
> @@ -2393,6 +2430,40 @@ aarch64_expand_builtin_memtag (int fcode, tree exp, 
> rtx target)
>emit_insn (pat);
>return target;
>  }

Nit: missing blank line here.

> +/* Function to expand an expression EXP which calls one of the ACLE Data
> +   Intrinsic builtins FCODE with the result going to TARGET.  */
> +static rtx
> +aarch64_expand_builtin_data_intrinsic (unsigned int fcode, tree exp, rtx 
> target)
> +{
> +  expand_operand ops[2];
> +  machine_mode mode = GET_MODE (target);
> +  create_output_operand ([0], target, mode);
> +  create_input_operand ([1], expand_normal (CALL_EXPR_ARG (exp, 0)), 
> mode);
> +  enum insn_code icode;
> +  switch (fcode)
> +{
> +case AARCH64_REV16:
> +case AARCH64_REV16L:
> +case AARCH64_REV16LL:
> +  if (mode == SImode)
> + icode = CODE_FOR_aarch64_rev16si;
> +  else
> + icode = CODE_FOR_aarch64_rev16di;

You should be able to do:

  icode = code_for_aarch64_rev (mode);

instead.  Same for the next cases.

> +  break;
> +case AARCH64_RBIT:
> +case AARCH64_RBITL:
> +case AARCH64_RBITLL:
> +  if (mode == SImode)
> + icode = CODE_FOR_aarch64_rbitsi;
> +  else
> + icode = CODE_FOR_aarch64_rbitdi;
> +  break;
> +default:
> +  gcc_unreachable ();
> +}
> +  expand_insn (icode, 2, ops);
> +  return target;

This needs to return ops[0].value instead, since "target" just suggests
a possible location.

Could you add tests for a memory source and memory destination, e.g.:

void test_clz_mem (uint32_t *a)
{
  *a = __clz (*a);
}


Re: [PATCH][AArch64] Implement ACLE Data Intrinsics

2022-06-28 Thread Andre Vieira (lists) via Gcc-patches


On 17/06/2022 11:54, Richard Sandiford wrote:

"Andre Vieira (lists)"  writes:

Hi,

This patch adds support for the ACLE Data Intrinsics to the AArch64 port.

Bootstrapped and regression tested on aarch64-none-linux.

OK for trunk?

Sorry for the slow review.

No worries :)



+{\
+  size_t size = sizeof (TYPE) * __CHAR_BIT__;\
+  rotate = rotate % size;\
+  return value >> rotate | value << (size - rotate); \

This runs into UB for rotate == 0.
I assume it's because of the value << size no? I added a modulo, I 
assume it's legal to shift by 0?


This OK?
diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
b/gcc/config/aarch64/aarch64-builtins.cc
index 
e0a741ac663188713e21f457affa57217d074783..69f1fb3604a481fa378d105cf3ee98edec1ba619
 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -613,6 +613,12 @@ enum aarch64_builtins
   AARCH64_LS64_BUILTIN_ST64B,
   AARCH64_LS64_BUILTIN_ST64BV,
   AARCH64_LS64_BUILTIN_ST64BV0,
+  AARCH64_REV16,
+  AARCH64_REV16L,
+  AARCH64_REV16LL,
+  AARCH64_RBIT,
+  AARCH64_RBITL,
+  AARCH64_RBITLL,
   AARCH64_BUILTIN_MAX
 };
 
@@ -1664,10 +1670,41 @@ aarch64_init_ls64_builtins (void)
   = aarch64_general_add_builtin (data[i].name, data[i].type, data[i].code);
 }
 
+static void
+aarch64_init_data_intrinsics (void)
+{
+  tree uint32_fntype = build_function_type_list (uint32_type_node,
+uint32_type_node, NULL_TREE);
+  tree ulong_fntype = build_function_type_list (long_unsigned_type_node,
+   long_unsigned_type_node,
+   NULL_TREE);
+  tree uint64_fntype = build_function_type_list (uint64_type_node,
+uint64_type_node, NULL_TREE);
+  aarch64_builtin_decls[AARCH64_REV16]
+= aarch64_general_add_builtin ("__builtin_aarch64_rev16", uint32_fntype,
+  AARCH64_REV16);
+  aarch64_builtin_decls[AARCH64_REV16L]
+= aarch64_general_add_builtin ("__builtin_aarch64_rev16l", ulong_fntype,
+  AARCH64_REV16L);
+  aarch64_builtin_decls[AARCH64_REV16LL]
+= aarch64_general_add_builtin ("__builtin_aarch64_rev16ll", uint64_fntype,
+  AARCH64_REV16LL);
+  aarch64_builtin_decls[AARCH64_RBIT]
+= aarch64_general_add_builtin ("__builtin_aarch64_rbit", uint32_fntype,
+  AARCH64_RBIT);
+  aarch64_builtin_decls[AARCH64_RBITL]
+= aarch64_general_add_builtin ("__builtin_aarch64_rbitl", ulong_fntype,
+  AARCH64_RBITL);
+  aarch64_builtin_decls[AARCH64_RBITLL]
+= aarch64_general_add_builtin ("__builtin_aarch64_rbitll", uint64_fntype,
+  AARCH64_RBITLL);
+}
+
 /* Implement #pragma GCC aarch64 "arm_acle.h".  */
 void
 handle_arm_acle_h (void)
 {
+  aarch64_init_data_intrinsics ();
   if (TARGET_LS64)
 aarch64_init_ls64_builtins ();
 }
@@ -2393,6 +2430,40 @@ aarch64_expand_builtin_memtag (int fcode, tree exp, rtx 
target)
   emit_insn (pat);
   return target;
 }
+/* Function to expand an expression EXP which calls one of the ACLE Data
+   Intrinsic builtins FCODE with the result going to TARGET.  */
+static rtx
+aarch64_expand_builtin_data_intrinsic (unsigned int fcode, tree exp, rtx 
target)
+{
+  expand_operand ops[2];
+  machine_mode mode = GET_MODE (target);
+  create_output_operand ([0], target, mode);
+  create_input_operand ([1], expand_normal (CALL_EXPR_ARG (exp, 0)), mode);
+  enum insn_code icode;
+  switch (fcode)
+{
+case AARCH64_REV16:
+case AARCH64_REV16L:
+case AARCH64_REV16LL:
+  if (mode == SImode)
+   icode = CODE_FOR_aarch64_rev16si;
+  else
+   icode = CODE_FOR_aarch64_rev16di;
+  break;
+case AARCH64_RBIT:
+case AARCH64_RBITL:
+case AARCH64_RBITLL:
+  if (mode == SImode)
+   icode = CODE_FOR_aarch64_rbitsi;
+  else
+   icode = CODE_FOR_aarch64_rbitdi;
+  break;
+default:
+  gcc_unreachable ();
+}
+  expand_insn (icode, 2, ops);
+  return target;
+}
 
 /* Expand an expression EXP as fpsr or fpcr setter (depending on
UNSPEC) using MODE.  */
@@ -2551,6 +2622,9 @@ aarch64_general_expand_builtin (unsigned int fcode, tree 
exp, rtx target,
   if (fcode >= AARCH64_MEMTAG_BUILTIN_START
   && fcode <= AARCH64_MEMTAG_BUILTIN_END)
 return aarch64_expand_builtin_memtag (fcode, exp, target);
+  if (fcode >= AARCH64_REV16
+  && fcode <= AARCH64_RBITLL)
+return aarch64_expand_builtin_data_intrinsic (fcode, exp, target);
 
   gcc_unreachable ();
 }
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 

Re: [PATCH][AArch64] Implement ACLE Data Intrinsics

2022-06-17 Thread Richard Sandiford via Gcc-patches
"Andre Vieira (lists)"  writes:
> Hi,
>
> This patch adds support for the ACLE Data Intrinsics to the AArch64 port.
>
> Bootstrapped and regression tested on aarch64-none-linux.
>
> OK for trunk?

Sorry for the slow review.

>
> gcc/ChangeLog:
>
> 2022-06-10  Andre Vieira  
>
>      * config/aarch64/aarch64.md (rbit2): Rename this ...
>      (@aarch64_rbit): ... this and change it in...
>      (ffs2,ctz2): ... here.
>      (@aarch64_rev16): New.
>      * config/aarch64/aarch64-builtins.cc: (aarch64_builtins):
>      Define the following enum AARCH64_REV16, AARCH64_REV16L, 
> AARCH64_REV16LL,
>      AARCH64_RBIT, AARCH64_RBITL, AARCH64_RBITLL.
>      (aarch64_init_data_intrinsics): New.
>      (handle_arm_acle_h): Add call to aarch64_init_data_intrinsics.
>      (aarch64_expand_builtin_data_intrinsic): New.
>      (aarch64_general_expand_builtin): Add call to 
> aarch64_expand_builtin_data_intrinsic.
>      * config/aarch64/arm_acle.h (__clz, __clzl, __clzll, __cls, 
> __clsl, __clsll, __rbit,
>      __rbitl, __rbitll, __rev, __revl, __revll, __rev16, __rev16l, 
> __rev16ll, __ror, __rorl,
>      __rorll, __revsh): New.
>
> gcc/testsuite/ChangeLog:
>
> 2022-06-10  Andre Vieira  
>
>      * gcc.target/aarch64/acle/data-intrinsics.c: New test.
>
> diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
> b/gcc/config/aarch64/aarch64-builtins.cc
> index 
> e0a741ac663188713e21f457affa57217d074783..91a687dee13a27c21f0c50de9ba777aa900d6096
>  100644
> --- a/gcc/config/aarch64/aarch64-builtins.cc
> +++ b/gcc/config/aarch64/aarch64-builtins.cc
> @@ -613,6 +613,12 @@ enum aarch64_builtins
>AARCH64_LS64_BUILTIN_ST64B,
>AARCH64_LS64_BUILTIN_ST64BV,
>AARCH64_LS64_BUILTIN_ST64BV0,
> +  AARCH64_REV16,
> +  AARCH64_REV16L,
> +  AARCH64_REV16LL,
> +  AARCH64_RBIT,
> +  AARCH64_RBITL,
> +  AARCH64_RBITLL,
>AARCH64_BUILTIN_MAX
>  };
>  
> @@ -1664,10 +1670,41 @@ aarch64_init_ls64_builtins (void)
>= aarch64_general_add_builtin (data[i].name, data[i].type, 
> data[i].code);
>  }
>  
> +static void
> +aarch64_init_data_intrinsics (void)
> +{
> +  tree uint32_fntype = build_function_type_list (uint32_type_node,
> +  uint32_type_node, NULL_TREE);
> +  tree long_fntype = build_function_type_list (long_unsigned_type_node,
> +long_unsigned_type_node,
> +NULL_TREE);

Very minor, but ulong_fntype might be clearer, since the other two
variable names are explicitly unsigned.

> +  tree uint64_fntype = build_function_type_list (uint64_type_node,
> +  uint64_type_node, NULL_TREE);
> +  aarch64_builtin_decls[AARCH64_REV16]
> += aarch64_general_add_builtin ("__builtin_aarch64_rev16", uint32_fntype,
> +AARCH64_REV16);
> +  aarch64_builtin_decls[AARCH64_REV16L]
> += aarch64_general_add_builtin ("__builtin_aarch64_rev16l", long_fntype,
> +AARCH64_REV16L);
> +  aarch64_builtin_decls[AARCH64_REV16LL]
> += aarch64_general_add_builtin ("__builtin_aarch64_rev16ll", 
> uint64_fntype,
> +AARCH64_REV16LL);
> +  aarch64_builtin_decls[AARCH64_RBIT]
> += aarch64_general_add_builtin ("__builtin_aarch64_rbit", uint32_fntype,
> +AARCH64_RBIT);
> +  aarch64_builtin_decls[AARCH64_RBITL]
> += aarch64_general_add_builtin ("__builtin_aarch64_rbitl", long_fntype,
> +AARCH64_RBITL);
> +  aarch64_builtin_decls[AARCH64_RBITLL]
> += aarch64_general_add_builtin ("__builtin_aarch64_rbitll", uint64_fntype,
> +AARCH64_RBITLL);
> +}
> +
>  /* Implement #pragma GCC aarch64 "arm_acle.h".  */
>  void
>  handle_arm_acle_h (void)
>  {
> +  aarch64_init_data_intrinsics ();
>if (TARGET_LS64)
>  aarch64_init_ls64_builtins ();
>  }
> @@ -2393,6 +2430,32 @@ aarch64_expand_builtin_memtag (int fcode, tree exp, 
> rtx target)
>emit_insn (pat);
>return target;
>  }
> +/* Function to expand an expression EXP which calls one of the ACLE Data
> +   Intrinsic builtins FCODE with the result going to TARGET.  */
> +static rtx
> +aarch64_expand_builtin_data_intrinsic (unsigned int fcode, tree exp, rtx 
> target)
> +{
> +  rtx op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
> +  machine_mode mode = GET_MODE (op0);
> +  rtx pat;
> +  switch (fcode)
> +{
> +case AARCH64_REV16:
> +case AARCH64_REV16L:
> +case AARCH64_REV16LL:
> +  pat = gen_aarch64_rev16 (mode, target, op0);

Does this work when op0 is a constant or comes from memory?
Same for when target is a memory.  E.g. does:

void test_rev16 (uint32_t *ptr)
{
  *ptr = __rev16 (*ptr);
}

work?

It'd be more robust to use the expand_insn interface instead;
see aarch64_expand_builtin_ls64 for an example.

> +  break;
> +case AARCH64_RBIT:
> 

[PATCH][AArch64] Implement ACLE Data Intrinsics

2022-06-10 Thread Andre Vieira (lists) via Gcc-patches

Hi,

This patch adds support for the ACLE Data Intrinsics to the AArch64 port.

Bootstrapped and regression tested on aarch64-none-linux.

OK for trunk?

gcc/ChangeLog:

2022-06-10  Andre Vieira  

    * config/aarch64/aarch64.md (rbit2): Rename this ...
    (@aarch64_rbit): ... this and change it in...
    (ffs2,ctz2): ... here.
    (@aarch64_rev16): New.
    * config/aarch64/aarch64-builtins.cc: (aarch64_builtins):
    Define the following enum AARCH64_REV16, AARCH64_REV16L, 
AARCH64_REV16LL,

    AARCH64_RBIT, AARCH64_RBITL, AARCH64_RBITLL.
    (aarch64_init_data_intrinsics): New.
    (handle_arm_acle_h): Add call to aarch64_init_data_intrinsics.
    (aarch64_expand_builtin_data_intrinsic): New.
    (aarch64_general_expand_builtin): Add call to 
aarch64_expand_builtin_data_intrinsic.
    * config/aarch64/arm_acle.h (__clz, __clzl, __clzll, __cls, 
__clsl, __clsll, __rbit,
    __rbitl, __rbitll, __rev, __revl, __revll, __rev16, __rev16l, 
__rev16ll, __ror, __rorl,

    __rorll, __revsh): New.

gcc/testsuite/ChangeLog:

2022-06-10  Andre Vieira  

    * gcc.target/aarch64/acle/data-intrinsics.c: New test.
diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
b/gcc/config/aarch64/aarch64-builtins.cc
index 
e0a741ac663188713e21f457affa57217d074783..91a687dee13a27c21f0c50de9ba777aa900d6096
 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -613,6 +613,12 @@ enum aarch64_builtins
   AARCH64_LS64_BUILTIN_ST64B,
   AARCH64_LS64_BUILTIN_ST64BV,
   AARCH64_LS64_BUILTIN_ST64BV0,
+  AARCH64_REV16,
+  AARCH64_REV16L,
+  AARCH64_REV16LL,
+  AARCH64_RBIT,
+  AARCH64_RBITL,
+  AARCH64_RBITLL,
   AARCH64_BUILTIN_MAX
 };
 
@@ -1664,10 +1670,41 @@ aarch64_init_ls64_builtins (void)
   = aarch64_general_add_builtin (data[i].name, data[i].type, data[i].code);
 }
 
+static void
+aarch64_init_data_intrinsics (void)
+{
+  tree uint32_fntype = build_function_type_list (uint32_type_node,
+uint32_type_node, NULL_TREE);
+  tree long_fntype = build_function_type_list (long_unsigned_type_node,
+  long_unsigned_type_node,
+  NULL_TREE);
+  tree uint64_fntype = build_function_type_list (uint64_type_node,
+uint64_type_node, NULL_TREE);
+  aarch64_builtin_decls[AARCH64_REV16]
+= aarch64_general_add_builtin ("__builtin_aarch64_rev16", uint32_fntype,
+  AARCH64_REV16);
+  aarch64_builtin_decls[AARCH64_REV16L]
+= aarch64_general_add_builtin ("__builtin_aarch64_rev16l", long_fntype,
+  AARCH64_REV16L);
+  aarch64_builtin_decls[AARCH64_REV16LL]
+= aarch64_general_add_builtin ("__builtin_aarch64_rev16ll", uint64_fntype,
+  AARCH64_REV16LL);
+  aarch64_builtin_decls[AARCH64_RBIT]
+= aarch64_general_add_builtin ("__builtin_aarch64_rbit", uint32_fntype,
+  AARCH64_RBIT);
+  aarch64_builtin_decls[AARCH64_RBITL]
+= aarch64_general_add_builtin ("__builtin_aarch64_rbitl", long_fntype,
+  AARCH64_RBITL);
+  aarch64_builtin_decls[AARCH64_RBITLL]
+= aarch64_general_add_builtin ("__builtin_aarch64_rbitll", uint64_fntype,
+  AARCH64_RBITLL);
+}
+
 /* Implement #pragma GCC aarch64 "arm_acle.h".  */
 void
 handle_arm_acle_h (void)
 {
+  aarch64_init_data_intrinsics ();
   if (TARGET_LS64)
 aarch64_init_ls64_builtins ();
 }
@@ -2393,6 +2430,32 @@ aarch64_expand_builtin_memtag (int fcode, tree exp, rtx 
target)
   emit_insn (pat);
   return target;
 }
+/* Function to expand an expression EXP which calls one of the ACLE Data
+   Intrinsic builtins FCODE with the result going to TARGET.  */
+static rtx
+aarch64_expand_builtin_data_intrinsic (unsigned int fcode, tree exp, rtx 
target)
+{
+  rtx op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
+  machine_mode mode = GET_MODE (op0);
+  rtx pat;
+  switch (fcode)
+{
+case AARCH64_REV16:
+case AARCH64_REV16L:
+case AARCH64_REV16LL:
+  pat = gen_aarch64_rev16 (mode, target, op0);
+  break;
+case AARCH64_RBIT:
+case AARCH64_RBITL:
+case AARCH64_RBITLL:
+  pat = gen_aarch64_rbit (mode, target, op0);
+  break;
+default:
+  gcc_unreachable ();
+}
+  emit_insn (pat);
+  return target;
+}
 
 /* Expand an expression EXP as fpsr or fpcr setter (depending on
UNSPEC) using MODE.  */
@@ -2551,6 +2614,9 @@ aarch64_general_expand_builtin (unsigned int fcode, tree 
exp, rtx target,
   if (fcode >= AARCH64_MEMTAG_BUILTIN_START
   && fcode <= AARCH64_MEMTAG_BUILTIN_END)
 return aarch64_expand_builtin_memtag (fcode, exp, target);
+  if (fcode >= AARCH64_REV16
+  && fcode <= AARCH64_RBITLL)
+return aarch64_expand_builtin_data_intrinsic (fcode, exp, target);