Re: [PATCH 30/57] target/arm: Improve vector UQADD, UQSUB, SQADD, SQSUB

2024-05-23 Thread Richard Henderson

On 5/23/24 07:14, Peter Maydell wrote:

  void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
  {
  static const TCGOpcode vecop_list[] = {
-INDEX_op_usadd_vec, INDEX_op_cmp_vec, INDEX_op_add_vec, 0
+INDEX_op_usadd_vec, INDEX_op_add_vec, 0
  };


Why don't we need to add XOR to this list? Is it because we can
assume we have XOR-of-vector on all vector-capable hosts?


Correct.  There's an assert in tcg_can_emit_vecop_list that (1) baseline vector ops are 
*not* included in the list, and (2) all other ops *are* included in the list.



r~



Re: [PATCH 30/57] target/arm: Improve vector UQADD, UQSUB, SQADD, SQSUB

2024-05-23 Thread Peter Maydell
On Mon, 6 May 2024 at 02:07, Richard Henderson wrote:
>
> No need for a full comparison; xor produces non-zero bits
> for QC just fine.
>
> Signed-off-by: Richard Henderson 
> ---
>  target/arm/tcg/gengvec.c | 32 ++++++++++++++++----------------
>  1 file changed, 16 insertions(+), 16 deletions(-)
>
> diff --git a/target/arm/tcg/gengvec.c b/target/arm/tcg/gengvec.c
> index 22c9d17dce..bfe6885a01 100644
> --- a/target/arm/tcg/gengvec.c
> +++ b/target/arm/tcg/gengvec.c
> @@ -1217,21 +1217,21 @@ void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
>  tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
>  }
>
> -static void gen_uqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec sat,
> +static void gen_uqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
>TCGv_vec a, TCGv_vec b)
>  {
>  TCGv_vec x = tcg_temp_new_vec_matching(t);
>  tcg_gen_add_vec(vece, x, a, b);
>  tcg_gen_usadd_vec(vece, t, a, b);
> -tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t);
> -tcg_gen_or_vec(vece, sat, sat, x);
> +tcg_gen_xor_vec(vece, x, x, t);
> +tcg_gen_or_vec(vece, qc, qc, x);
>  }
>
>  void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
> uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
>  {
>  static const TCGOpcode vecop_list[] = {
> -INDEX_op_usadd_vec, INDEX_op_cmp_vec, INDEX_op_add_vec, 0
> +INDEX_op_usadd_vec, INDEX_op_add_vec, 0
>  };

Why don't we need to add XOR to this list? Is it because we can
assume we have XOR-of-vector on all vector-capable hosts?

thanks
-- PMM



[PATCH 30/57] target/arm: Improve vector UQADD, UQSUB, SQADD, SQSUB

2024-05-05 Thread Richard Henderson
No need for a full comparison; xor produces non-zero bits
for QC just fine.

Signed-off-by: Richard Henderson 
---
 target/arm/tcg/gengvec.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/target/arm/tcg/gengvec.c b/target/arm/tcg/gengvec.c
index 22c9d17dce..bfe6885a01 100644
--- a/target/arm/tcg/gengvec.c
+++ b/target/arm/tcg/gengvec.c
@@ -1217,21 +1217,21 @@ void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
 }
 
-static void gen_uqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec sat,
+static void gen_uqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
   TCGv_vec a, TCGv_vec b)
 {
 TCGv_vec x = tcg_temp_new_vec_matching(t);
 tcg_gen_add_vec(vece, x, a, b);
 tcg_gen_usadd_vec(vece, t, a, b);
-tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t);
-tcg_gen_or_vec(vece, sat, sat, x);
+tcg_gen_xor_vec(vece, x, x, t);
+tcg_gen_or_vec(vece, qc, qc, x);
 }
 
 void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
 {
 static const TCGOpcode vecop_list[] = {
-INDEX_op_usadd_vec, INDEX_op_cmp_vec, INDEX_op_add_vec, 0
+INDEX_op_usadd_vec, INDEX_op_add_vec, 0
 };
 static const GVecGen4 ops[4] = {
 { .fniv = gen_uqadd_vec,
@@ -1259,21 +1259,21 @@ void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
 }
 
-static void gen_sqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec sat,
+static void gen_sqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
   TCGv_vec a, TCGv_vec b)
 {
 TCGv_vec x = tcg_temp_new_vec_matching(t);
 tcg_gen_add_vec(vece, x, a, b);
 tcg_gen_ssadd_vec(vece, t, a, b);
-tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t);
-tcg_gen_or_vec(vece, sat, sat, x);
+tcg_gen_xor_vec(vece, x, x, t);
+tcg_gen_or_vec(vece, qc, qc, x);
 }
 
 void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
 {
 static const TCGOpcode vecop_list[] = {
-INDEX_op_ssadd_vec, INDEX_op_cmp_vec, INDEX_op_add_vec, 0
+INDEX_op_ssadd_vec, INDEX_op_add_vec, 0
 };
 static const GVecGen4 ops[4] = {
 { .fniv = gen_sqadd_vec,
@@ -1301,21 +1301,21 @@ void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
 }
 
-static void gen_uqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec sat,
+static void gen_uqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
   TCGv_vec a, TCGv_vec b)
 {
 TCGv_vec x = tcg_temp_new_vec_matching(t);
 tcg_gen_sub_vec(vece, x, a, b);
 tcg_gen_ussub_vec(vece, t, a, b);
-tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t);
-tcg_gen_or_vec(vece, sat, sat, x);
+tcg_gen_xor_vec(vece, x, x, t);
+tcg_gen_or_vec(vece, qc, qc, x);
 }
 
 void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
 {
 static const TCGOpcode vecop_list[] = {
-INDEX_op_ussub_vec, INDEX_op_cmp_vec, INDEX_op_sub_vec, 0
+INDEX_op_ussub_vec, INDEX_op_sub_vec, 0
 };
 static const GVecGen4 ops[4] = {
 { .fniv = gen_uqsub_vec,
@@ -1343,21 +1343,21 @@ void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
 }
 
-static void gen_sqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec sat,
+static void gen_sqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
   TCGv_vec a, TCGv_vec b)
 {
 TCGv_vec x = tcg_temp_new_vec_matching(t);
 tcg_gen_sub_vec(vece, x, a, b);
 tcg_gen_sssub_vec(vece, t, a, b);
-tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t);
-tcg_gen_or_vec(vece, sat, sat, x);
+tcg_gen_xor_vec(vece, x, x, t);
+tcg_gen_or_vec(vece, qc, qc, x);
 }
 
 void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
 {
 static const TCGOpcode vecop_list[] = {
-INDEX_op_sssub_vec, INDEX_op_cmp_vec, INDEX_op_sub_vec, 0
+INDEX_op_sssub_vec, INDEX_op_sub_vec, 0
 };
 static const GVecGen4 ops[4] = {
 { .fniv = gen_sqsub_vec,
-- 
2.34.1