Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation

2013-11-18 Thread Claudio Fontana
Hello,

On 09/27/2013 08:25 PM, Richard Henderson wrote:
 On 09/26/2013 05:48 PM, Alexander Graf wrote:
 This patch adds emulation support for the orr instruction.

 Signed-off-by: Alexander Graf ag...@suse.de
 ---
  target-arm/helper-a64.c|  28 +++
  target-arm/helper-a64.h|   1 +
  target-arm/translate-a64.c | 120 
 +
  3 files changed, 149 insertions(+)

 diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c
 index 8105fb5..da72b7f 100644
 --- a/target-arm/helper-a64.c
 +++ b/target-arm/helper-a64.c
 @@ -24,3 +24,31 @@
  #include sysemu/sysemu.h
  #include qemu/bitops.h
  
 +uint32_t HELPER(pstate_add)(uint32_t pstate, uint64_t a1, uint64_t a2,
 +uint64_t ar)
 +{
 +int64_t s1 = a1;
 +int64_t s2 = a2;
 +int64_t sr = ar;
 +
 +pstate = ~(PSTATE_N | PSTATE_Z | PSTATE_C | PSTATE_V);
 +
 +if (sr  0) {
 +pstate |= PSTATE_N;
 +}
 +
 +if (!ar) {
 +pstate |= PSTATE_Z;
 +}
 +
 +if (ar  (ar  a1)) {
 +pstate |= PSTATE_C;
 +}
 +
 +if ((s1  0  s2  0  sr  0) ||
 +(s1  0  s2  0  sr  0)) {
 +pstate |= PSTATE_V;
 +}
 +
 +return pstate;
 +}
 
 Why are you not using the same split apart bits as A32?
 
 +/* XXX carry_out */
 +switch (shift_type) {
 
 What carry out?  I see no such in the ShiftReg description.
 
 +case 3:
 +tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift);
 +break;
 
 Incorrect rotate for 32bit?
 
 +static void handle_orr(DisasContext *s, uint32_t insn)
 +{
 +int is_32bit = !get_bits(insn, 31, 1);
 +int dest = get_reg(insn);
 +int source = get_bits(insn, 5, 5);
 +int rm = get_bits(insn, 16, 5);
 +int shift_amount = get_sbits(insn, 10, 6);
 +int is_n = get_bits(insn, 21, 1);
 +int shift_type = get_bits(insn, 22, 2);
 +int opc = get_bits(insn, 29, 2);
 +bool setflags = (opc == 0x3);
 +TCGv_i64 tcg_op2;
 +TCGv_i64 tcg_dest;
 +
 +if (is_32bit  (shift_amount  0)) {
 +/* reserved value */
 +unallocated_encoding(s);
 +}
 
 Why are you extracting shift_amount signed?
 
 +
 +/* MOV is dest = xzr  (source  ~0) */
 
 Comment is wrong.
 
 +if (!shift_amount  source == 0x1f) {

Besides the comment, is this correct?
I am trying to rework this patch, but this part seems incorrect to me.

We land here for the AND as well, and if source(rn) is xzr,
then I would expect the result to be zero for AND regardless of anything else,
and not a MOV.

Can we really do this optimization in general here for AND, OR, EOR?

Thanks for any clarification,

Claudio

 +if (is_32bit) {
 +tcg_gen_ext32u_i64(cpu_reg_sp(dest), cpu_reg(rm));
 +} else {
 +tcg_gen_mov_i64(cpu_reg_sp(dest), cpu_reg(rm));
 +}
 +if (is_n) {
 +tcg_gen_not_i64(cpu_reg_sp(dest), cpu_reg_sp(dest));
 +}
 +if (is_32bit) {
 +tcg_gen_ext32u_i64(cpu_reg_sp(dest), cpu_reg_sp(dest));
 +}
 
 These are incorrect -- no sp in the logical ops, but xzr instead.
 
 And surely we can emit fewer opcodes for the simple cases here.
 Since these are the canonical aliases for mov/mvn, it'll pay off.
 
 TCGv src = cpu_reg(rm);
 TCGv dst = cpu_reg(rd);
 
 if (is_n) {
 tcg_gen_not_i64(dst, src);
 src = dst;
 }
 if (is_32bit) {
 tcg_gen_ext32u_i64(dst, src);
 } else {
 tcg_gen_mov_i64(dst, src);
 }
 
 Note that tcg_gen_mov_i64 does the src == dst check, so a simple
 64-bit mvn will only emit the not.
 
 
 +tcg_dest = cpu_reg(dest);
 +switch (opc) {
 +case 0x0:
 +case 0x3:
 +tcg_gen_and_i64(tcg_dest, cpu_reg(source), tcg_op2);
 +break;
 +case 0x1:
 +tcg_gen_or_i64(tcg_dest, cpu_reg(source), tcg_op2);
 +break;
 +case 0x2:
 +tcg_gen_xor_i64(tcg_dest, cpu_reg(source), tcg_op2);
 +break;
 +}
 +
 +if (is_32bit) {
 +tcg_gen_ext32u_i64(tcg_dest, tcg_dest);
 +}
 +
 +if (setflags) {
 +gen_helper_pstate_add(pstate, pstate, tcg_dest, cpu_reg(31), 
 tcg_dest);
 +}
 
 Incorrect flags generated.  They're different between add/sub and logical.
 In particular, C and V are always zero.
 
 +handle_orr(s, insn);
 
 And please use a more proper name than ORR for something that handles all
 of the logical insns.
 
 
 r~
 




Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation

2013-11-18 Thread Laurent Desnogues
On Mon, Nov 18, 2013 at 11:15 AM, Claudio Fontana
claudio.font...@linaro.org wrote:
 Hello,

 On 09/27/2013 08:25 PM, Richard Henderson wrote:
 On 09/26/2013 05:48 PM, Alexander Graf wrote:
 This patch adds emulation support for the orr instruction.

 Signed-off-by: Alexander Graf ag...@suse.de
 ---
  target-arm/helper-a64.c|  28 +++
  target-arm/helper-a64.h|   1 +
  target-arm/translate-a64.c | 120 
 +
  3 files changed, 149 insertions(+)

 diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c
 index 8105fb5..da72b7f 100644
 --- a/target-arm/helper-a64.c
 +++ b/target-arm/helper-a64.c
 @@ -24,3 +24,31 @@
  #include sysemu/sysemu.h
  #include qemu/bitops.h

 +uint32_t HELPER(pstate_add)(uint32_t pstate, uint64_t a1, uint64_t a2,
 +uint64_t ar)
 +{
 +int64_t s1 = a1;
 +int64_t s2 = a2;
 +int64_t sr = ar;
 +
 +pstate = ~(PSTATE_N | PSTATE_Z | PSTATE_C | PSTATE_V);
 +
 +if (sr  0) {
 +pstate |= PSTATE_N;
 +}
 +
 +if (!ar) {
 +pstate |= PSTATE_Z;
 +}
 +
 +if (ar  (ar  a1)) {
 +pstate |= PSTATE_C;
 +}
 +
 +if ((s1  0  s2  0  sr  0) ||
 +(s1  0  s2  0  sr  0)) {
 +pstate |= PSTATE_V;
 +}
 +
 +return pstate;
 +}

 Why are you not using the same split apart bits as A32?

 +/* XXX carry_out */
 +switch (shift_type) {

 What carry out?  I see no such in the ShiftReg description.

 +case 3:
 +tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift);
 +break;

 Incorrect rotate for 32bit?

 +static void handle_orr(DisasContext *s, uint32_t insn)
 +{
 +int is_32bit = !get_bits(insn, 31, 1);
 +int dest = get_reg(insn);
 +int source = get_bits(insn, 5, 5);
 +int rm = get_bits(insn, 16, 5);
 +int shift_amount = get_sbits(insn, 10, 6);
 +int is_n = get_bits(insn, 21, 1);
 +int shift_type = get_bits(insn, 22, 2);
 +int opc = get_bits(insn, 29, 2);
 +bool setflags = (opc == 0x3);
 +TCGv_i64 tcg_op2;
 +TCGv_i64 tcg_dest;
 +
 +if (is_32bit  (shift_amount  0)) {
 +/* reserved value */
 +unallocated_encoding(s);
 +}

 Why are you extracting shift_amount signed?

 +
 +/* MOV is dest = xzr  (source  ~0) */

 Comment is wrong.

 +if (!shift_amount  source == 0x1f) {

 Besides the comment, is this correct?
 I am trying to rework this patch, but this part seems incorrect to me.

 We land here for the AND as well, and if source(rn) is xzr,
 then I would expect the result to be zero for AND regardless of anything else,
 and not a MOV.

 Can we really do this optimization in general here for AND, OR, EOR?

That part is definitely wrong:  there's a missing check that opc = 1
(ORR/ORN for MOV/MVN).  The comment also is very wrong :-)

Also note that SP can't be accessed by the shifted reg logical ops as
Richard wrote.


Laurent

 Thanks for any clarification,

 Claudio

 +if (is_32bit) {
 +tcg_gen_ext32u_i64(cpu_reg_sp(dest), cpu_reg(rm));
 +} else {
 +tcg_gen_mov_i64(cpu_reg_sp(dest), cpu_reg(rm));
 +}
 +if (is_n) {
 +tcg_gen_not_i64(cpu_reg_sp(dest), cpu_reg_sp(dest));
 +}
 +if (is_32bit) {
 +tcg_gen_ext32u_i64(cpu_reg_sp(dest), cpu_reg_sp(dest));
 +}

 These are incorrect -- no sp in the logical ops, but xzr instead.

 And surely we can emit fewer opcodes for the simple cases here.
 Since these are the canonical aliases for mov/mvn, it'll pay off.

 TCGv src = cpu_reg(rm);
 TCGv dst = cpu_reg(rd);

 if (is_n) {
 tcg_gen_not_i64(dst, src);
 src = dst;
 }
 if (is_32bit) {
 tcg_gen_ext32u_i64(dst, src);
 } else {
 tcg_gen_mov_i64(dst, src);
 }

 Note that tcg_gen_mov_i64 does the src == dst check, so a simple
 64-bit mvn will only emit the not.


 +tcg_dest = cpu_reg(dest);
 +switch (opc) {
 +case 0x0:
 +case 0x3:
 +tcg_gen_and_i64(tcg_dest, cpu_reg(source), tcg_op2);
 +break;
 +case 0x1:
 +tcg_gen_or_i64(tcg_dest, cpu_reg(source), tcg_op2);
 +break;
 +case 0x2:
 +tcg_gen_xor_i64(tcg_dest, cpu_reg(source), tcg_op2);
 +break;
 +}
 +
 +if (is_32bit) {
 +tcg_gen_ext32u_i64(tcg_dest, tcg_dest);
 +}
 +
 +if (setflags) {
 +gen_helper_pstate_add(pstate, pstate, tcg_dest, cpu_reg(31), 
 tcg_dest);
 +}

 Incorrect flags generated.  They're different between add/sub and logical.
 In particular, C and V are always zero.

 +handle_orr(s, insn);

 And please use a more proper name than ORR for something that handles all
 of the logical insns.


 r~





Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation

2013-11-18 Thread Michael Matz
Hi,

On Mon, 18 Nov 2013, Claudio Fontana wrote:

  +case 3:
  +tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift);
  +break;
  
  Incorrect rotate for 32bit?

32bit rotates and shifts were fixed in a patch later than the 60er series 
Alex posted.  See attached.  (Generally there are many fixes to emulated 
instructions in that branch)

  +if (!shift_amount  source == 0x1f) {
 
 Besides the comment, is this correct?

No, it needs to check for opc == 1.

  +tcg_dest = cpu_reg(dest);
  +switch (opc) {
  +case 0x0:
  +case 0x3:
  +tcg_gen_and_i64(tcg_dest, cpu_reg(source), tcg_op2);
  +break;
  +case 0x1:
  +tcg_gen_or_i64(tcg_dest, cpu_reg(source), tcg_op2);
  +break;
  +case 0x2:
  +tcg_gen_xor_i64(tcg_dest, cpu_reg(source), tcg_op2);
  +break;
  +}
  +
  +if (is_32bit) {
  +tcg_gen_ext32u_i64(tcg_dest, tcg_dest);
  +}
  +
  +if (setflags) {
  +gen_helper_pstate_add(pstate, pstate, tcg_dest, cpu_reg(31), 
  tcg_dest);
  +}
  
  Incorrect flags generated.  They're different between add/sub and logical.
  In particular, C and V are always zero.

That's done correctly with the fixed pstate helpers coming with a later 
patch (see attached as well).  reg31 is zero, so that's flags as if for 
dest == dest + 0, and PSTATE_C and PSTATE_V will be zero.  That is, the 
logical flags are the same as the arithmetic flags for result plus zero 
with no carry_in.


Ciao,
Michael.From df54486da31d6329696effa61096eda5ab85395a Mon Sep 17 00:00:00 2001
From: Michael Matz m...@suse.de
Date: Sun, 24 Mar 2013 02:52:42 +0100
Subject: [PATCH] Fix 32bit rotates.

The 32bit shifts generally weren't careful with the upper parts,
either bits could leak in (for right shift) or leak or (for left shift).
And rotate was completely off, rotating around bit 63, not 31.
This fixes the CAST5 hash algorithm.
---
 target-arm/translate-a64.c | 30 +++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 96dc281..e3941a1 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -596,25 +596,49 @@ static TCGv_i64 get_shift(int reg, int shift_type, TCGv_i64 tcg_shift,
 r = tcg_temp_new_i64();
 
 /* XXX carry_out */
+/* Careful with the width.  We work on 64bit, but must make sure
+   that we zero-extend the result on out, and ignore any upper bits,
+   that might still be set in that register.  */
 switch (shift_type) {
 case 0: /* LSL */
+	/* left shift is easy, simply zero-extend on out */
 tcg_gen_shl_i64(r, cpu_reg(reg), tcg_shift);
+	if (is_32bit)
+	  tcg_gen_ext32u_i64 (r, r);
 break;
 case 1: /* LSR */
-tcg_gen_shr_i64(r, cpu_reg(reg), tcg_shift);
+	/* For logical right shift we zero extend first, to zero
+	   the upper bits.  We don't need to extend on out.  */
+	if (is_32bit) {
+	tcg_gen_ext32u_i64 (r, cpu_reg(reg));
+	tcg_gen_shr_i64 (r, r, tcg_shift);
+	} else
+	  tcg_gen_shr_i64(r, cpu_reg(reg), tcg_shift);
 break;
 case 2: /* ASR */
+	/* For arithmetic right shift we sign extend first, then shift,
+	   and then need to clear the upper bits again.  */
 if (is_32bit) {
 TCGv_i64 tcg_tmp = tcg_temp_new_i64();
 tcg_gen_ext32s_i64(tcg_tmp, cpu_reg(reg));
 tcg_gen_sar_i64(r, tcg_tmp, tcg_shift);
+	tcg_gen_ext32u_i64 (r, r);
 tcg_temp_free_i64(tcg_tmp);
 } else {
 tcg_gen_sar_i64(r, cpu_reg(reg), tcg_shift);
 }
 break;
-case 3:
-tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift);
+case 3: /* ROR */
+	/* For rotation extending doesn't help, we really have to use
+	   a 32bit rotate.  */
+	if (is_32bit) {
+	TCGv_i32 tmp = tcg_temp_new_i32();
+tcg_gen_trunc_i64_i32(tmp, cpu_reg(reg));
+	tcg_gen_rotr_i32(tmp, tmp, tcg_shift);
+tcg_gen_extu_i32_i64(r, tmp);
+tcg_temp_free_i32(tmp);
+	} else
+	  tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift);
 break;
 }
 
-- 
1.8.1.4

From 33137f8a660750d7d8598c7e467f4ccc8dc5ef85 Mon Sep 17 00:00:00 2001
From: Michael Matz m...@suse.de
Date: Sat, 23 Mar 2013 04:53:44 +0100
Subject: [PATCH] Fix the pstate flags helpers

ADCS and SBCS/SUBS sometimes gave the wrong results
for the C and V flags.  This fixes it.
---
 target-arm/helper-a64.c | 52 -
 1 file changed, 12 insertions(+), 40 deletions(-)

diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c
index 4375bf0..4fcb09b 100644
--- a/target-arm/helper-a64.c
+++ b/target-arm/helper-a64.c
@@ -7,8 +7,6 @@
 
 uint32_t HELPER(pstate_add)(uint32_t pstate, uint64_t a1, uint64_t a2, uint64_t ar)
 {
-int64_t s1 = a1;
-int64_t s2 = a2;
 int64_t sr = ar;
 
 pstate = ~(PSTATE_N | PSTATE_Z | PSTATE_C | PSTATE_V);
@@ -21,11 +19,15 @@ 

Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation

2013-11-18 Thread Peter Maydell
On 18 November 2013 13:12, Michael Matz m...@suse.de wrote:
 Hi,

 On Mon, 18 Nov 2013, Claudio Fontana wrote:

  +case 3:
  +tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift);
  +break;
 
  Incorrect rotate for 32bit?

 32bit rotates and shifts were fixed in a patch later than the 60er series
 Alex posted.  See attached.  (Generally there are many fixes to emulated
 instructions in that branch)

I think we're going to need to look through and fold in those
fixes, otherwise we'll end up reduplicating that work in the
course of code review :-(

-- PMM



Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation

2013-11-18 Thread Claudio Fontana
On 11/18/2013 02:15 PM, Peter Maydell wrote:
 On 18 November 2013 13:12, Michael Matz m...@suse.de wrote:
 Hi,

 On Mon, 18 Nov 2013, Claudio Fontana wrote:

 +case 3:
 +tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift);
 +break;

 Incorrect rotate for 32bit?

 32bit rotates and shifts were fixed in a patch later than the 60er series
 Alex posted.  See attached.  (Generally there are many fixes to emulated
 instructions in that branch)
 
 I think we're going to need to look through and fold in those
 fixes, otherwise we'll end up reduplicating that work in the
 course of code review :-(
 
 -- PMM
 

Thanks all.

Regarding the access to registers in 32 bit mode, and the consequent write to 
registers in 32 bit mode,
I am investigating how to do it a little bit more general, in the sense that 
generally when we access registers in 32bit mode
we will (often) need to ignore the upper bits of the source register, and write 
zero to the destination register.
Not always but often. This could be done once for all to reduce the chances of 
mistakes.

C.





Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation

2013-11-18 Thread Claudio Fontana
Btw, in the first patch:

On 11/18/2013 02:12 PM, Michael Matz wrote:
 
 From df54486da31d6329696effa61096eda5ab85395a Mon Sep 17 00:00:00 2001
 From: Michael Matz m...@suse.de
 Date: Sun, 24 Mar 2013 02:52:42 +0100
 Subject: [PATCH] Fix 32bit rotates.
 
 The 32bit shifts generally weren't careful with the upper parts,
 either bits could leak in (for right shift) or leak or (for left shift).
 And rotate was completely off, rotating around bit 63, not 31.
 This fixes the CAST5 hash algorithm.
 ---
  target-arm/translate-a64.c | 30 +++---
  1 file changed, 27 insertions(+), 3 deletions(-)
 
 diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
 index 96dc281..e3941a1 100644
 --- a/target-arm/translate-a64.c
 +++ b/target-arm/translate-a64.c
 @@ -596,25 +596,49 @@ static TCGv_i64 get_shift(int reg, int shift_type, 
 TCGv_i64 tcg_shift,
  r = tcg_temp_new_i64();
  
  /* XXX carry_out */
 +/* Careful with the width.  We work on 64bit, but must make sure
 +   that we zero-extend the result on out, and ignore any upper bits,
 +   that might still be set in that register.  */
  switch (shift_type) {
  case 0: /* LSL */
 + /* left shift is easy, simply zero-extend on out */
  tcg_gen_shl_i64(r, cpu_reg(reg), tcg_shift);
 + if (is_32bit)
 +   tcg_gen_ext32u_i64 (r, r);
  break;
  case 1: /* LSR */
 -tcg_gen_shr_i64(r, cpu_reg(reg), tcg_shift);
 + /* For logical right shift we zero extend first, to zero
 +the upper bits.  We don't need to extend on out.  */
 + if (is_32bit) {
 + tcg_gen_ext32u_i64 (r, cpu_reg(reg));
 + tcg_gen_shr_i64 (r, r, tcg_shift);
 + } else
 +   tcg_gen_shr_i64(r, cpu_reg(reg), tcg_shift);
  break;
  case 2: /* ASR */
 + /* For arithmetic right shift we sign extend first, then shift,
 +and then need to clear the upper bits again.  */
  if (is_32bit) {
  TCGv_i64 tcg_tmp = tcg_temp_new_i64();
  tcg_gen_ext32s_i64(tcg_tmp, cpu_reg(reg));
  tcg_gen_sar_i64(r, tcg_tmp, tcg_shift);
 + tcg_gen_ext32u_i64 (r, r);
  tcg_temp_free_i64(tcg_tmp);
  } else {
  tcg_gen_sar_i64(r, cpu_reg(reg), tcg_shift);
  }
  break;
 -case 3:
 -tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift);
 +case 3: /* ROR */
 + /* For rotation extending doesn't help, we really have to use
 +a 32bit rotate.  */
 + if (is_32bit) {
 + TCGv_i32 tmp = tcg_temp_new_i32();
 +tcg_gen_trunc_i64_i32(tmp, cpu_reg(reg));
 + tcg_gen_rotr_i32(tmp, tmp, tcg_shift);

Isn't this problematic?
We are using gen_rotr_i32, but passing tcg_shift, which is a TCGv_i64.
I remember I had compilation failures in the past when I tried something 
similar,
so my understanding is that this can work with a certain compiler under certain 
compiler options,
but is not guaranteed to work in all cases.

I think we need to either explicitly convert the tcg_shift to a TCGv_i32, or we 
need to use an open coded version of the rotr_i64 that inserts at (32 - n) 
instead of (64 - n)

What do you think?

C.

 +tcg_gen_extu_i32_i64(r, tmp);
 +tcg_temp_free_i32(tmp);
 + } else
 +   tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift);
  break;
  }
  
 -- 1.8.1.4
 




Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation

2013-11-18 Thread Peter Maydell
On 18 November 2013 13:43, Claudio Fontana claudio.font...@linaro.org wrote:
 We are using gen_rotr_i32, but passing tcg_shift, which is a TCGv_i64.
 I remember I had compilation failures in the past when I tried something 
 similar,
 so my understanding is that this can work with a certain compiler under 
 certain compiler options,
 but is not guaranteed to work in all cases.

It's a debug option -- if you build with --enable-debug
then TCGv_i32/TCGv_i64 mismatches should always cause
compile failures.

-- PMM



Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation

2013-11-18 Thread Michael Matz
Hi,

On Mon, 18 Nov 2013, Peter Maydell wrote:

   +case 3:
   +tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift);
   +break;
  
   Incorrect rotate for 32bit?
 
  32bit rotates and shifts were fixed in a patch later than the 60er series
  Alex posted.  See attached.  (Generally there are many fixes to emulated
  instructions in that branch)
 
 I think we're going to need to look through and fold in those fixes, 
 otherwise we'll end up reduplicating that work in the course of code 
 review :-(

Most probably.  Authorship will be lost :-/ I was planning to submit all 
of these once the 60er set of Alex would be applied.  If you're reworking 
that set more of less completely then it indeed makes more sense to fold 
in those things and so it'd probably be better to have them applied (i.e. 
basically have our full branch applied when dissecting things).

The commits that fix things in the a64 decoder proper (not the linux-user 
implementation) would be:

e14c1a5 softfloat: correctly handle overflow in float[32|64] to uint64 
conversion
cbc98b1 aarch64: Fix FCVTZU for single float
a91f762 aarch64: Fix UZP/ZIP/TRN
644c748 aarch64: Fix 32bit TST
2a717e8 Fix FCVTAS and FCVTAU
0dd22d0 Fix decoding of floating-fixed conversions
d52c999 Fix implementation of USHLL/SSHLL
cfbb9e1 Fix using uninitialized value
ecfdfcd Fix typo in FSUB detection
87fd8ca Increase MAX_OP_PER_INSTR
38452d8 Fix USHLL, and implement other SIMD shifts
4146d40 Fix INS element
a62437c Fix fcmp(e) with NaNs
ec2b8f3 softfloat: Fix float64_to_uint64
b003867 Fix EXTR for 32bit
df54486 Fix 32bit rotates.
33137f8 Fix the pstate flags helpers
75cb838 Don't set flush to zero by default
564e811 Fix 128bit ldr (literal)
0ff91a0 Fix long immediate constants

(That's all on top Alex' posted patchset but not git rebased onto it, top 
of Alex roughly corresponds to commit 40d66b61)


Ciao,
Michael.



Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation

2013-11-18 Thread Peter Maydell
On 18 November 2013 13:46, Michael Matz m...@suse.de wrote:
 On Mon, 18 Nov 2013, Peter Maydell wrote:
 I think we're going to need to look through and fold in those fixes,
 otherwise we'll end up reduplicating that work in the course of code
 review :-(

 Most probably.  Authorship will be lost :-/ I was planning to submit all
 of these once the 60er set of Alex would be applied.  If you're reworking
 that set more of less completely then it indeed makes more sense to fold
 in those things and so it'd probably be better to have them applied (i.e.
 basically have our full branch applied when dissecting things).

 The commits that fix things in the a64 decoder proper (not the linux-user
 implementation) would be:

 e14c1a5 softfloat: correctly handle overflow in float[32|64] to uint64 
 conversion
 cbc98b1 aarch64: Fix FCVTZU for single float
 a91f762 aarch64: Fix UZP/ZIP/TRN
 644c748 aarch64: Fix 32bit TST
 2a717e8 Fix FCVTAS and FCVTAU
 0dd22d0 Fix decoding of floating-fixed conversions
 d52c999 Fix implementation of USHLL/SSHLL
 cfbb9e1 Fix using uninitialized value
 ecfdfcd Fix typo in FSUB detection
 87fd8ca Increase MAX_OP_PER_INSTR
 38452d8 Fix USHLL, and implement other SIMD shifts
 4146d40 Fix INS element
 a62437c Fix fcmp(e) with NaNs
 ec2b8f3 softfloat: Fix float64_to_uint64
 b003867 Fix EXTR for 32bit
 df54486 Fix 32bit rotates.
 33137f8 Fix the pstate flags helpers
 75cb838 Don't set flush to zero by default
 564e811 Fix 128bit ldr (literal)
 0ff91a0 Fix long immediate constants

This looks like a small enough list to be tractable to fold
in. My suggestion for authorship would be that we have the
'From' line indicate whoever wrote the bulk of the code and
add sign-off lines from both of you.

(Some of those, like the softfloat fixes, are probably
standalone patches anyway.)

thanks
-- PMM



Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation

2013-11-18 Thread Michael Matz
Hi,

On Mon, 18 Nov 2013, Claudio Fontana wrote:

  +tcg_gen_trunc_i64_i32(tmp, cpu_reg(reg));
  +   tcg_gen_rotr_i32(tmp, tmp, tcg_shift);
 
 Isn't this problematic?
 We are using gen_rotr_i32, but passing tcg_shift, which is a TCGv_i64.

With CONFIG_DEBUG_TCG it'll break, yes.  Though in principle there's no 
canonical relation between the two argument types for shifts and rotates 
(unlike addition for example) TCG indeed wants to ensure that 
typeof(arg2)==typeof(arg1).

 I remember I had compilation failures in the past when I tried something 
 similar, so my understanding is that this can work with a certain 
 compiler under certain compiler options, but is not guaranteed to work 
 in all cases.
 
 I think we need to either explicitly convert the tcg_shift to a 
 TCGv_i32, or we need to use an open coded version of the rotr_i64 that 
 inserts at (32 - n) instead of (64 - n)
 
 What do you think?

I think converting tcg_shift might eventually lead to better generated 
code (if tcg is optmizing enough, now or in the future, haven't checked).


Ciao,
Michael.



Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation

2013-11-18 Thread Richard Henderson
On 11/18/2013 11:55 PM, Michael Matz wrote:
  I think we need to either explicitly convert the tcg_shift to a 
  TCGv_i32, or we need to use an open coded version of the rotr_i64 that 
  inserts at (32 - n) instead of (64 - n)
  
  What do you think?
 I think converting tcg_shift might eventually lead to better generated 
 code (if tcg is optmizing enough, now or in the future, haven't checked).

Agreed.


r~



Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation

2013-10-30 Thread Alexander Graf

On 27.09.2013, at 11:25, Richard Henderson r...@twiddle.net wrote:

 On 09/26/2013 05:48 PM, Alexander Graf wrote:
 This patch adds emulation support for the orr instruction.
 
 Signed-off-by: Alexander Graf ag...@suse.de
 ---
 target-arm/helper-a64.c|  28 +++
 target-arm/helper-a64.h|   1 +
 target-arm/translate-a64.c | 120 
 +
 3 files changed, 149 insertions(+)
 
 diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c
 index 8105fb5..da72b7f 100644
 --- a/target-arm/helper-a64.c
 +++ b/target-arm/helper-a64.c
 @@ -24,3 +24,31 @@
 #include sysemu/sysemu.h
 #include qemu/bitops.h
 
 +uint32_t HELPER(pstate_add)(uint32_t pstate, uint64_t a1, uint64_t a2,
 +uint64_t ar)
 +{
 +int64_t s1 = a1;
 +int64_t s2 = a2;
 +int64_t sr = ar;
 +
 +pstate = ~(PSTATE_N | PSTATE_Z | PSTATE_C | PSTATE_V);
 +
 +if (sr  0) {
 +pstate |= PSTATE_N;
 +}
 +
 +if (!ar) {
 +pstate |= PSTATE_Z;
 +}
 +
 +if (ar  (ar  a1)) {
 +pstate |= PSTATE_C;
 +}
 +
 +if ((s1  0  s2  0  sr  0) ||
 +(s1  0  s2  0  sr  0)) {
 +pstate |= PSTATE_V;
 +}
 +
 +return pstate;
 +}
 
 Why are you not using the same split apart bits as A32?

There is an architecturally defined register that specifies what pstate looks 
like and IIRC that includes system level state as well, similar to EFLAGS. So I 
figured it's more straight forward to use a single variable for it.

I don't think it really makes much of a difference either way though. If we see 
that doing it in a split way makes more sense we can always just switch to that 
later.


Alex




Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation

2013-10-30 Thread Peter Maydell
On 31 October 2013 00:29, Alexander Graf ag...@suse.de wrote:

 On 27.09.2013, at 11:25, Richard Henderson r...@twiddle.net wrote:
 Why are you not using the same split apart bits as A32?

 There is an architecturally defined register that specifies
 what pstate looks like and IIRC that includes system level
 state as well, similar to EFLAGS.

No, the architecture goes out of its way to point out that pstate
is not a register. There are a collection of different state bits
which are generally accessible via different MSR/MRS
instructions or in some cases not accessible at all. This
is a difference from A32.

In any case as Richard says we already split NZCV from
the rest of CPSR in A32 -- the few places that want a
complete 32 bit CPSR call a helper function that assembles
it from the various separate parts.

 I don't think it really makes much of a difference either way
 though. If we see that doing it in a split way makes more
 sense we can always just switch to that later.

I think the split is less critical for A64 because of the
severely reduced conditionality which means we're less
likely to be making frequent NZCV checks. However it's
probably still worth having because it's pretty nearly free.

-- PMM



Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation

2013-09-27 Thread Richard Henderson
On 09/26/2013 05:48 PM, Alexander Graf wrote:
 This patch adds emulation support for the orr instruction.
 
 Signed-off-by: Alexander Graf ag...@suse.de
 ---
  target-arm/helper-a64.c|  28 +++
  target-arm/helper-a64.h|   1 +
  target-arm/translate-a64.c | 120 
 +
  3 files changed, 149 insertions(+)
 
 diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c
 index 8105fb5..da72b7f 100644
 --- a/target-arm/helper-a64.c
 +++ b/target-arm/helper-a64.c
 @@ -24,3 +24,31 @@
  #include sysemu/sysemu.h
  #include qemu/bitops.h
  
 +uint32_t HELPER(pstate_add)(uint32_t pstate, uint64_t a1, uint64_t a2,
 +uint64_t ar)
 +{
 +int64_t s1 = a1;
 +int64_t s2 = a2;
 +int64_t sr = ar;
 +
 +pstate = ~(PSTATE_N | PSTATE_Z | PSTATE_C | PSTATE_V);
 +
 +if (sr  0) {
 +pstate |= PSTATE_N;
 +}
 +
 +if (!ar) {
 +pstate |= PSTATE_Z;
 +}
 +
 +if (ar  (ar  a1)) {
 +pstate |= PSTATE_C;
 +}
 +
 +if ((s1  0  s2  0  sr  0) ||
 +(s1  0  s2  0  sr  0)) {
 +pstate |= PSTATE_V;
 +}
 +
 +return pstate;
 +}

Why are you not using the same split apart bits as A32?

 +/* XXX carry_out */
 +switch (shift_type) {

What carry out?  I see no such in the ShiftReg description.

 +case 3:
 +tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift);
 +break;

Incorrect rotate for 32bit?

 +static void handle_orr(DisasContext *s, uint32_t insn)
 +{
 +int is_32bit = !get_bits(insn, 31, 1);
 +int dest = get_reg(insn);
 +int source = get_bits(insn, 5, 5);
 +int rm = get_bits(insn, 16, 5);
 +int shift_amount = get_sbits(insn, 10, 6);
 +int is_n = get_bits(insn, 21, 1);
 +int shift_type = get_bits(insn, 22, 2);
 +int opc = get_bits(insn, 29, 2);
 +bool setflags = (opc == 0x3);
 +TCGv_i64 tcg_op2;
 +TCGv_i64 tcg_dest;
 +
 +if (is_32bit  (shift_amount  0)) {
 +/* reserved value */
 +unallocated_encoding(s);
 +}

Why are you extracting shift_amount signed?

 +
 +/* MOV is dest = xzr  (source  ~0) */

Comment is wrong.

 +if (!shift_amount  source == 0x1f) {
 +if (is_32bit) {
 +tcg_gen_ext32u_i64(cpu_reg_sp(dest), cpu_reg(rm));
 +} else {
 +tcg_gen_mov_i64(cpu_reg_sp(dest), cpu_reg(rm));
 +}
 +if (is_n) {
 +tcg_gen_not_i64(cpu_reg_sp(dest), cpu_reg_sp(dest));
 +}
 +if (is_32bit) {
 +tcg_gen_ext32u_i64(cpu_reg_sp(dest), cpu_reg_sp(dest));
 +}

These are incorrect -- no sp in the logical ops, but xzr instead.

And surely we can emit fewer opcodes for the simple cases here.
Since these are the canonical aliases for mov/mvn, it'll pay off.

TCGv src = cpu_reg(rm);
TCGv dst = cpu_reg(rd);

if (is_n) {
tcg_gen_not_i64(dst, src);
src = dst;
}
if (is_32bit) {
tcg_gen_ext32u_i64(dst, src);
} else {
tcg_gen_mov_i64(dst, src);
}

Note that tcg_gen_mov_i64 does the src == dst check, so a simple
64-bit mvn will only emit the not.


 +tcg_dest = cpu_reg(dest);
 +switch (opc) {
 +case 0x0:
 +case 0x3:
 +tcg_gen_and_i64(tcg_dest, cpu_reg(source), tcg_op2);
 +break;
 +case 0x1:
 +tcg_gen_or_i64(tcg_dest, cpu_reg(source), tcg_op2);
 +break;
 +case 0x2:
 +tcg_gen_xor_i64(tcg_dest, cpu_reg(source), tcg_op2);
 +break;
 +}
 +
 +if (is_32bit) {
 +tcg_gen_ext32u_i64(tcg_dest, tcg_dest);
 +}
 +
 +if (setflags) {
 +gen_helper_pstate_add(pstate, pstate, tcg_dest, cpu_reg(31), 
 tcg_dest);
 +}

Incorrect flags generated.  They're different between add/sub and logical.
In particular, C and V are always zero.

 +handle_orr(s, insn);

And please use a more proper name than ORR for something that handles all
of the logical insns.


r~



[Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation

2013-09-26 Thread Alexander Graf
This patch adds emulation support for the orr instruction.

Signed-off-by: Alexander Graf ag...@suse.de
---
 target-arm/helper-a64.c|  28 +++
 target-arm/helper-a64.h|   1 +
 target-arm/translate-a64.c | 120 +
 3 files changed, 149 insertions(+)

diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c
index 8105fb5..da72b7f 100644
--- a/target-arm/helper-a64.c
+++ b/target-arm/helper-a64.c
@@ -24,3 +24,31 @@
 #include sysemu/sysemu.h
 #include qemu/bitops.h
 
+uint32_t HELPER(pstate_add)(uint32_t pstate, uint64_t a1, uint64_t a2,
+uint64_t ar)
+{
+int64_t s1 = a1;
+int64_t s2 = a2;
+int64_t sr = ar;
+
+pstate = ~(PSTATE_N | PSTATE_Z | PSTATE_C | PSTATE_V);
+
+if (sr  0) {
+pstate |= PSTATE_N;
+}
+
+if (!ar) {
+pstate |= PSTATE_Z;
+}
+
+if (ar  (ar  a1)) {
+pstate |= PSTATE_C;
+}
+
+if ((s1  0  s2  0  sr  0) ||
+(s1  0  s2  0  sr  0)) {
+pstate |= PSTATE_V;
+}
+
+return pstate;
+}
diff --git a/target-arm/helper-a64.h b/target-arm/helper-a64.h
index 30ecf78..1492b15 100644
--- a/target-arm/helper-a64.h
+++ b/target-arm/helper-a64.h
@@ -17,3 +17,4 @@
  * License along with this library; if not, see http://www.gnu.org/licenses/.
  */
 
+DEF_HELPER_FLAGS_4(pstate_add, TCG_CALL_NO_RWG_SE, i32, i32, i64, i64, i64)
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 039e73a..2a80715 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -484,6 +484,123 @@ static void handle_ldarx(DisasContext *s, uint32_t insn)
 tcg_temp_free_i64(tcg_addr);
 }
 
+static TCGv_i64 get_shift(int reg, int shift_type, TCGv_i64 tcg_shift,
+  int is_32bit)
+{
+TCGv_i64 r;
+
+r = tcg_temp_new_i64();
+
+/* XXX carry_out */
+switch (shift_type) {
+case 0: /* LSL */
+tcg_gen_shl_i64(r, cpu_reg(reg), tcg_shift);
+break;
+case 1: /* LSR */
+tcg_gen_shr_i64(r, cpu_reg(reg), tcg_shift);
+break;
+case 2: /* ASR */
+if (is_32bit) {
+TCGv_i64 tcg_tmp = tcg_temp_new_i64();
+tcg_gen_ext32s_i64(tcg_tmp, cpu_reg(reg));
+tcg_gen_sar_i64(r, tcg_tmp, tcg_shift);
+tcg_temp_free_i64(tcg_tmp);
+} else {
+tcg_gen_sar_i64(r, cpu_reg(reg), tcg_shift);
+}
+break;
+case 3:
+tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift);
+break;
+}
+
+return r;
+}
+
+static TCGv_i64 get_shifti(int reg, int shift_type, int shift, int is_32bit)
+{
+TCGv_i64 tcg_shift;
+TCGv_i64 r;
+
+if (!shift) {
+r = tcg_temp_new_i64();
+tcg_gen_mov_i64(r, cpu_reg(reg));
+return r;
+}
+
+tcg_shift = tcg_const_i64(shift);
+r = get_shift(reg, shift_type, tcg_shift, is_32bit);
+tcg_temp_free_i64(tcg_shift);
+
+return r;
+}
+
+static void handle_orr(DisasContext *s, uint32_t insn)
+{
+int is_32bit = !get_bits(insn, 31, 1);
+int dest = get_reg(insn);
+int source = get_bits(insn, 5, 5);
+int rm = get_bits(insn, 16, 5);
+int shift_amount = get_sbits(insn, 10, 6);
+int is_n = get_bits(insn, 21, 1);
+int shift_type = get_bits(insn, 22, 2);
+int opc = get_bits(insn, 29, 2);
+bool setflags = (opc == 0x3);
+TCGv_i64 tcg_op2;
+TCGv_i64 tcg_dest;
+
+if (is_32bit  (shift_amount  0)) {
+/* reserved value */
+unallocated_encoding(s);
+}
+
+/* MOV is dest = xzr  (source  ~0) */
+if (!shift_amount  source == 0x1f) {
+if (is_32bit) {
+tcg_gen_ext32u_i64(cpu_reg_sp(dest), cpu_reg(rm));
+} else {
+tcg_gen_mov_i64(cpu_reg_sp(dest), cpu_reg(rm));
+}
+if (is_n) {
+tcg_gen_not_i64(cpu_reg_sp(dest), cpu_reg_sp(dest));
+}
+if (is_32bit) {
+tcg_gen_ext32u_i64(cpu_reg_sp(dest), cpu_reg_sp(dest));
+}
+return;
+}
+
+tcg_op2 = get_shifti(rm, shift_type, shift_amount  (is_32bit ? 31 : 63),
+ is_32bit);
+if (is_n) {
+tcg_gen_not_i64(tcg_op2, tcg_op2);
+}
+
+tcg_dest = cpu_reg(dest);
+switch (opc) {
+case 0x0:
+case 0x3:
+tcg_gen_and_i64(tcg_dest, cpu_reg(source), tcg_op2);
+break;
+case 0x1:
+tcg_gen_or_i64(tcg_dest, cpu_reg(source), tcg_op2);
+break;
+case 0x2:
+tcg_gen_xor_i64(tcg_dest, cpu_reg(source), tcg_op2);
+break;
+}
+
+if (is_32bit) {
+tcg_gen_ext32u_i64(tcg_dest, tcg_dest);
+}
+
+if (setflags) {
+gen_helper_pstate_add(pstate, pstate, tcg_dest, cpu_reg(31), tcg_dest);
+}
+
+tcg_temp_free_i64(tcg_op2);
+}
+
 void disas_a64_insn(CPUARMState *env, DisasContext *s)
 {
 uint32_t insn;
@@ -516,6 +633,9 @@ void disas_a64_insn(CPUARMState *env, DisasContext *s)
 handle_ldarx(s, insn);