Re: [PATCH 21/31] target/arm: Implement SVE2 integer absolute difference and accumulate long

2020-04-14 Thread Laurent Desnogues
On Tue, Apr 14, 2020 at 1:19 AM Richard Henderson wrote:
>
> On 4/13/20 9:15 AM, Laurent Desnogues wrote:
> > On Fri, Mar 27, 2020 at 12:18 AM Richard Henderson wrote:
> > [...]
> >> diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
> >> index a3653007ac..a0995d95c7 100644
> >> --- a/target/arm/sve_helper.c
> >> +++ b/target/arm/sve_helper.c
> >> @@ -1216,6 +1216,30 @@ DO_ZZZ_NTB(sve2_eoril_d, uint64_t, , DO_EOR)
> >>
> >>  #undef DO_ZZZ_NTB
> >>
> >> +#define DO_ABAL(NAME, TYPE, TYPEN) \
> >> +void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, uint32_t desc) \
> >> +{  \
> >> +intptr_t i, opr_sz = simd_oprsz(desc); \
> >> +int sel1 = (simd_data(desc) & 1) * sizeof(TYPE);   \
> >> +int sel2 = (simd_data(desc) & 2) * (sizeof(TYPE) / 2); \
> >> +for (i = 0; i < opr_sz; i += sizeof(TYPE)) {   \
> >> +TYPE nn = (TYPEN)(*(TYPE *)(vn + i) >> sel1);  \
> >> +TYPE mm = (TYPEN)(*(TYPE *)(vm + i) >> sel2);  \
> >> +TYPE aa = *(TYPE *)(va + i);   \
> >> +*(TYPE *)(vd + i) = DO_ABD(nn, mm) + aa;   \
> >> +}  \
> >> +}
> >
> > ABAL is either top or bottom not a mix of two.  So only sel1 is needed
> > and its multiplicand should be the number of bits of TYPEN.
>
> Yep.
>
> > vd is both a source and a destination so a temporary should be used.
>
> In what way am I not?  Both sources are read before the write.  The operands
> are all in columns of the wide type (unlike the addp case you pointed out).

You're right, sorry.

Laurent



Re: [PATCH 21/31] target/arm: Implement SVE2 integer absolute difference and accumulate long

2020-04-13 Thread Richard Henderson
On 4/13/20 9:15 AM, Laurent Desnogues wrote:
> On Fri, Mar 27, 2020 at 12:18 AM Richard Henderson wrote:
> [...]
>> diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
>> index a3653007ac..a0995d95c7 100644
>> --- a/target/arm/sve_helper.c
>> +++ b/target/arm/sve_helper.c
>> @@ -1216,6 +1216,30 @@ DO_ZZZ_NTB(sve2_eoril_d, uint64_t, , DO_EOR)
>>
>>  #undef DO_ZZZ_NTB
>>
>> +#define DO_ABAL(NAME, TYPE, TYPEN) \
>> +void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, uint32_t desc) \
>> +{  \
>> +intptr_t i, opr_sz = simd_oprsz(desc); \
>> +int sel1 = (simd_data(desc) & 1) * sizeof(TYPE);   \
>> +int sel2 = (simd_data(desc) & 2) * (sizeof(TYPE) / 2); \
>> +for (i = 0; i < opr_sz; i += sizeof(TYPE)) {   \
>> +TYPE nn = (TYPEN)(*(TYPE *)(vn + i) >> sel1);  \
>> +TYPE mm = (TYPEN)(*(TYPE *)(vm + i) >> sel2);  \
>> +TYPE aa = *(TYPE *)(va + i);   \
>> +*(TYPE *)(vd + i) = DO_ABD(nn, mm) + aa;   \
>> +}  \
>> +}
> 
> ABAL is either top or bottom not a mix of two.  So only sel1 is needed
> and its multiplicand should be the number of bits of TYPEN.

Yep.

> vd is both a source and a destination so a temporary should be used.

In what way am I not?  Both sources are read before the write.  The operands
are all in columns of the wide type (unlike the addp case you pointed out).


r~



Re: [PATCH 21/31] target/arm: Implement SVE2 integer absolute difference and accumulate long

2020-04-13 Thread Laurent Desnogues
On Fri, Mar 27, 2020 at 12:18 AM Richard Henderson wrote:
[...]
> diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
> index a3653007ac..a0995d95c7 100644
> --- a/target/arm/sve_helper.c
> +++ b/target/arm/sve_helper.c
> @@ -1216,6 +1216,30 @@ DO_ZZZ_NTB(sve2_eoril_d, uint64_t, , DO_EOR)
>
>  #undef DO_ZZZ_NTB
>
> +#define DO_ABAL(NAME, TYPE, TYPEN) \
> +void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, uint32_t desc) \
> +{  \
> +intptr_t i, opr_sz = simd_oprsz(desc); \
> +int sel1 = (simd_data(desc) & 1) * sizeof(TYPE);   \
> +int sel2 = (simd_data(desc) & 2) * (sizeof(TYPE) / 2); \
> +for (i = 0; i < opr_sz; i += sizeof(TYPE)) {   \
> +TYPE nn = (TYPEN)(*(TYPE *)(vn + i) >> sel1);  \
> +TYPE mm = (TYPEN)(*(TYPE *)(vm + i) >> sel2);  \
> +TYPE aa = *(TYPE *)(va + i);   \
> +*(TYPE *)(vd + i) = DO_ABD(nn, mm) + aa;   \
> +}  \
> +}

ABAL is either top or bottom not a mix of two.  So only sel1 is needed
and its multiplicand should be the number of bits of TYPEN.
vd is both a source and a destination so a temporary should be used.

Laurent



[PATCH 21/31] target/arm: Implement SVE2 integer absolute difference and accumulate long

2020-03-26 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper-sve.h| 14 ++
 target/arm/sve.decode  | 12 +
 target/arm/sve_helper.c| 24 +
 target/arm/translate-sve.c | 54 ++
 4 files changed, 104 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 0e4b4c48da..b48a88135f 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -2410,3 +2410,17 @@ DEF_HELPER_FLAGS_4(sve2_sqcadd_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve2_sqcadd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve2_sqcadd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve2_sqcadd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve2_sabal_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sabal_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_sabal_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve2_uabal_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uabal_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve2_uabal_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 5fb4b5f977..f66a6c242f 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -70,6 +70,7 @@
 _s  rd pg rn s
 _s rd pg rn rm s
 _esz   rd pg rn rm esz
+_esz   rd ra rn rm esz
 _esz  rd pg rn rm ra esz
 _esz   rd pg rn imm esz
   rd esz pat s
@@ -120,6 +121,10 @@
 @rdn_i8s esz:2 .. ... imm:s8 rd:5 \
 _esz rn=%reg_movprfx
 
+# Four operand, vector element size
+@rda_rn_rm   esz:2 . rm:5 ... ... rn:5 rd:5 \
+_esz ra=%reg_movprfx
+
 # Three operand with "memory" size, aka immediate left shift
 @rd_rn_msz_rm    ... rm:5  imm:2 rn:5 rd:5  
 
@@ -1235,3 +1240,10 @@ CADD_rot90  01000101 .. 0 0 11011 0 . .  @rdn_rm
 CADD_rot270 01000101 .. 0 0 11011 1 . .  @rdn_rm
 SQCADD_rot9001000101 .. 0 1 11011 0 . .  @rdn_rm
 SQCADD_rot270   01000101 .. 0 1 11011 1 . .  @rdn_rm
+
+## SVE2 integer absolute difference and accumulate long
+
+SABALB  01000101 .. 0 . 1100 00 . .  @rda_rn_rm
+SABALT  01000101 .. 0 . 1100 01 . .  @rda_rn_rm
+UABALB  01000101 .. 0 . 1100 10 . .  @rda_rn_rm
+UABALT  01000101 .. 0 . 1100 11 . .  @rda_rn_rm
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index a3653007ac..a0995d95c7 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -1216,6 +1216,30 @@ DO_ZZZ_NTB(sve2_eoril_d, uint64_t, , DO_EOR)
 
 #undef DO_ZZZ_NTB
 
+#define DO_ABAL(NAME, TYPE, TYPEN) \
+void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, uint32_t desc) \
+{  \
+intptr_t i, opr_sz = simd_oprsz(desc); \
+int sel1 = (simd_data(desc) & 1) * sizeof(TYPE);   \
+int sel2 = (simd_data(desc) & 2) * (sizeof(TYPE) / 2); \
+for (i = 0; i < opr_sz; i += sizeof(TYPE)) {   \
+TYPE nn = (TYPEN)(*(TYPE *)(vn + i) >> sel1);  \
+TYPE mm = (TYPEN)(*(TYPE *)(vm + i) >> sel2);  \
+TYPE aa = *(TYPE *)(va + i);   \
+*(TYPE *)(vd + i) = DO_ABD(nn, mm) + aa;   \
+}  \
+}
+
+DO_ABAL(sve2_sabal_h, int16_t, int8_t)
+DO_ABAL(sve2_sabal_s, int32_t, int16_t)
+DO_ABAL(sve2_sabal_d, int64_t, int32_t)
+
+DO_ABAL(sve2_uabal_h, uint16_t, uint8_t)
+DO_ABAL(sve2_uabal_s, uint32_t, uint16_t)
+DO_ABAL(sve2_uabal_d, uint64_t, uint32_t)
+
+#undef DO_ABAL
+
 #define DO_BITPERM(NAME, TYPE, OP) \
 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
 {  \
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 3b0aa86e79..c6161d2ce2 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -6240,3 +6240,57 @@ static bool trans_SQCADD_rot270(DisasContext *s, arg_rrr_esz *a)
 {
 return do_cadd(s, a, true, true);
 }
+
+static bool do_sve2__ool(DisasContext *s, arg__esz *a,
+ gen_helper_gvec_4 *fn, int data)
+{
+if (fn == NULL || !dc_isar_feature(aa64_sve2, s)) {
+return false;
+}
+if (sve_access_check(s)) {
+unsigned vsz = vec_full_reg_size(s);
+tcg_gen_gvec_4_ool(vec_full_reg_offset(s, a->rd),
+   vec_full_reg_offset(s, a->ra),
+