Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation
Hello, On 09/27/2013 08:25 PM, Richard Henderson wrote: On 09/26/2013 05:48 PM, Alexander Graf wrote: This patch adds emulation support for the orr instruction. Signed-off-by: Alexander Graf ag...@suse.de --- target-arm/helper-a64.c| 28 +++ target-arm/helper-a64.h| 1 + target-arm/translate-a64.c | 120 + 3 files changed, 149 insertions(+) diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c index 8105fb5..da72b7f 100644 --- a/target-arm/helper-a64.c +++ b/target-arm/helper-a64.c @@ -24,3 +24,31 @@ #include sysemu/sysemu.h #include qemu/bitops.h +uint32_t HELPER(pstate_add)(uint32_t pstate, uint64_t a1, uint64_t a2, +uint64_t ar) +{ +int64_t s1 = a1; +int64_t s2 = a2; +int64_t sr = ar; + +pstate = ~(PSTATE_N | PSTATE_Z | PSTATE_C | PSTATE_V); + +if (sr 0) { +pstate |= PSTATE_N; +} + +if (!ar) { +pstate |= PSTATE_Z; +} + +if (ar (ar a1)) { +pstate |= PSTATE_C; +} + +if ((s1 0 s2 0 sr 0) || +(s1 0 s2 0 sr 0)) { +pstate |= PSTATE_V; +} + +return pstate; +} Why are you not using the same split apart bits as A32? +/* XXX carry_out */ +switch (shift_type) { What carry out? I see no such in the ShiftReg description. +case 3: +tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift); +break; Incorrect rotate for 32bit? +static void handle_orr(DisasContext *s, uint32_t insn) +{ +int is_32bit = !get_bits(insn, 31, 1); +int dest = get_reg(insn); +int source = get_bits(insn, 5, 5); +int rm = get_bits(insn, 16, 5); +int shift_amount = get_sbits(insn, 10, 6); +int is_n = get_bits(insn, 21, 1); +int shift_type = get_bits(insn, 22, 2); +int opc = get_bits(insn, 29, 2); +bool setflags = (opc == 0x3); +TCGv_i64 tcg_op2; +TCGv_i64 tcg_dest; + +if (is_32bit (shift_amount 0)) { +/* reserved value */ +unallocated_encoding(s); +} Why are you extracting shift_amount signed? + +/* MOV is dest = xzr (source ~0) */ Comment is wrong. +if (!shift_amount source == 0x1f) { Besides the comment, is this correct? I am trying to rework this patch, but this part seems incorrect to me. 
We land here for the AND as well, and if source(rn) is xzr, then I would expect the result to be zero for AND regardless of anything else, and not a MOV. Can we really do this optimization in general here for AND, OR, EOR? Thanks for any clarification, Claudio +if (is_32bit) { +tcg_gen_ext32u_i64(cpu_reg_sp(dest), cpu_reg(rm)); +} else { +tcg_gen_mov_i64(cpu_reg_sp(dest), cpu_reg(rm)); +} +if (is_n) { +tcg_gen_not_i64(cpu_reg_sp(dest), cpu_reg_sp(dest)); +} +if (is_32bit) { +tcg_gen_ext32u_i64(cpu_reg_sp(dest), cpu_reg_sp(dest)); +} These are incorrect -- no sp in the logical ops, but xzr instead. And surely we can emit fewer opcodes for the simple cases here. Since these are the canonical aliases for mov/mvn, it'll pay off. TCGv src = cpu_reg(rm); TCGv dst = cpu_reg(rd); if (is_n) { tcg_gen_not_i64(dst, src); src = dst; } if (is_32bit) { tcg_gen_ext32u_i64(dst, src); } else { tcg_gen_mov_i64(dst, src); } Note that tcg_gen_mov_i64 does the src == dst check, so a simple 64-bit mvn will only emit the not. +tcg_dest = cpu_reg(dest); +switch (opc) { +case 0x0: +case 0x3: +tcg_gen_and_i64(tcg_dest, cpu_reg(source), tcg_op2); +break; +case 0x1: +tcg_gen_or_i64(tcg_dest, cpu_reg(source), tcg_op2); +break; +case 0x2: +tcg_gen_xor_i64(tcg_dest, cpu_reg(source), tcg_op2); +break; +} + +if (is_32bit) { +tcg_gen_ext32u_i64(tcg_dest, tcg_dest); +} + +if (setflags) { +gen_helper_pstate_add(pstate, pstate, tcg_dest, cpu_reg(31), tcg_dest); +} Incorrect flags generated. They're different between add/sub and logical. In particular, C and V are always zero. +handle_orr(s, insn); And please use a more proper name than ORR for something that handles all of the logical insns. r~
Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation
On Mon, Nov 18, 2013 at 11:15 AM, Claudio Fontana claudio.font...@linaro.org wrote: Hello, On 09/27/2013 08:25 PM, Richard Henderson wrote: On 09/26/2013 05:48 PM, Alexander Graf wrote: This patch adds emulation support for the orr instruction. Signed-off-by: Alexander Graf ag...@suse.de --- target-arm/helper-a64.c| 28 +++ target-arm/helper-a64.h| 1 + target-arm/translate-a64.c | 120 + 3 files changed, 149 insertions(+) diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c index 8105fb5..da72b7f 100644 --- a/target-arm/helper-a64.c +++ b/target-arm/helper-a64.c @@ -24,3 +24,31 @@ #include sysemu/sysemu.h #include qemu/bitops.h +uint32_t HELPER(pstate_add)(uint32_t pstate, uint64_t a1, uint64_t a2, +uint64_t ar) +{ +int64_t s1 = a1; +int64_t s2 = a2; +int64_t sr = ar; + +pstate = ~(PSTATE_N | PSTATE_Z | PSTATE_C | PSTATE_V); + +if (sr 0) { +pstate |= PSTATE_N; +} + +if (!ar) { +pstate |= PSTATE_Z; +} + +if (ar (ar a1)) { +pstate |= PSTATE_C; +} + +if ((s1 0 s2 0 sr 0) || +(s1 0 s2 0 sr 0)) { +pstate |= PSTATE_V; +} + +return pstate; +} Why are you not using the same split apart bits as A32? +/* XXX carry_out */ +switch (shift_type) { What carry out? I see no such in the ShiftReg description. +case 3: +tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift); +break; Incorrect rotate for 32bit? +static void handle_orr(DisasContext *s, uint32_t insn) +{ +int is_32bit = !get_bits(insn, 31, 1); +int dest = get_reg(insn); +int source = get_bits(insn, 5, 5); +int rm = get_bits(insn, 16, 5); +int shift_amount = get_sbits(insn, 10, 6); +int is_n = get_bits(insn, 21, 1); +int shift_type = get_bits(insn, 22, 2); +int opc = get_bits(insn, 29, 2); +bool setflags = (opc == 0x3); +TCGv_i64 tcg_op2; +TCGv_i64 tcg_dest; + +if (is_32bit (shift_amount 0)) { +/* reserved value */ +unallocated_encoding(s); +} Why are you extracting shift_amount signed? + +/* MOV is dest = xzr (source ~0) */ Comment is wrong. +if (!shift_amount source == 0x1f) { Besides the comment, is this correct? 
I am trying to rework this patch, but this part seems incorrect to me. We land here for the AND as well, and if source(rn) is xzr, then I would expect the result to be zero for AND regardless of anything else, and not a MOV. Can we really do this optimization in general here for AND, OR, EOR? That part is definitely wrong: there's a missing check that opc = 1 (ORR/ORN for MOV/MVN). The comment also is very wrong :-) Also note that SP can't be accessed by the shifted reg logical ops as Richard wrote. Laurent Thanks for any clarification, Claudio +if (is_32bit) { +tcg_gen_ext32u_i64(cpu_reg_sp(dest), cpu_reg(rm)); +} else { +tcg_gen_mov_i64(cpu_reg_sp(dest), cpu_reg(rm)); +} +if (is_n) { +tcg_gen_not_i64(cpu_reg_sp(dest), cpu_reg_sp(dest)); +} +if (is_32bit) { +tcg_gen_ext32u_i64(cpu_reg_sp(dest), cpu_reg_sp(dest)); +} These are incorrect -- no sp in the logical ops, but xzr instead. And surely we can emit fewer opcodes for the simple cases here. Since these are the canonical aliases for mov/mvn, it'll pay off. TCGv src = cpu_reg(rm); TCGv dst = cpu_reg(rd); if (is_n) { tcg_gen_not_i64(dst, src); src = dst; } if (is_32bit) { tcg_gen_ext32u_i64(dst, src); } else { tcg_gen_mov_i64(dst, src); } Note that tcg_gen_mov_i64 does the src == dst check, so a simple 64-bit mvn will only emit the not. +tcg_dest = cpu_reg(dest); +switch (opc) { +case 0x0: +case 0x3: +tcg_gen_and_i64(tcg_dest, cpu_reg(source), tcg_op2); +break; +case 0x1: +tcg_gen_or_i64(tcg_dest, cpu_reg(source), tcg_op2); +break; +case 0x2: +tcg_gen_xor_i64(tcg_dest, cpu_reg(source), tcg_op2); +break; +} + +if (is_32bit) { +tcg_gen_ext32u_i64(tcg_dest, tcg_dest); +} + +if (setflags) { +gen_helper_pstate_add(pstate, pstate, tcg_dest, cpu_reg(31), tcg_dest); +} Incorrect flags generated. They're different between add/sub and logical. In particular, C and V are always zero. +handle_orr(s, insn); And please use a more proper name than ORR for something that handles all of the logical insns. r~
Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation
Hi, On Mon, 18 Nov 2013, Claudio Fontana wrote: +case 3: +tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift); +break; Incorrect rotate for 32bit? 32bit rotates and shifts were fixed in a patch later than the 60er series Alex posted. See attached. (Generally there are many fixes to emulated instructions in that branch) +if (!shift_amount source == 0x1f) { Besides the comment, is this correct? No, it needs to check for opc == 1. +tcg_dest = cpu_reg(dest); +switch (opc) { +case 0x0: +case 0x3: +tcg_gen_and_i64(tcg_dest, cpu_reg(source), tcg_op2); +break; +case 0x1: +tcg_gen_or_i64(tcg_dest, cpu_reg(source), tcg_op2); +break; +case 0x2: +tcg_gen_xor_i64(tcg_dest, cpu_reg(source), tcg_op2); +break; +} + +if (is_32bit) { +tcg_gen_ext32u_i64(tcg_dest, tcg_dest); +} + +if (setflags) { +gen_helper_pstate_add(pstate, pstate, tcg_dest, cpu_reg(31), tcg_dest); +} Incorrect flags generated. They're different between add/sub and logical. In particular, C and V are always zero. That's done correctly with the fixed pstate helpers coming with a later patch (see attached as well). reg31 is zero, so that's flags as if for dest == dest + 0, and PSTATE_C and PSTATE_V will be zero. That is, the logical flags are the same as the arithmetic flags for result plus zero with no carry_in. Ciao, Michael.From df54486da31d6329696effa61096eda5ab85395a Mon Sep 17 00:00:00 2001 From: Michael Matz m...@suse.de Date: Sun, 24 Mar 2013 02:52:42 +0100 Subject: [PATCH] Fix 32bit rotates. The 32bit shifts generally weren't careful with the upper parts, either bits could leak in (for right shift) or leak or (for left shift). And rotate was completely off, rotating around bit 63, not 31. This fixes the CAST5 hash algorithm. 
--- target-arm/translate-a64.c | 30 +++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c index 96dc281..e3941a1 100644 --- a/target-arm/translate-a64.c +++ b/target-arm/translate-a64.c @@ -596,25 +596,49 @@ static TCGv_i64 get_shift(int reg, int shift_type, TCGv_i64 tcg_shift, r = tcg_temp_new_i64(); /* XXX carry_out */ +/* Careful with the width. We work on 64bit, but must make sure + that we zero-extend the result on out, and ignore any upper bits, + that might still be set in that register. */ switch (shift_type) { case 0: /* LSL */ + /* left shift is easy, simply zero-extend on out */ tcg_gen_shl_i64(r, cpu_reg(reg), tcg_shift); + if (is_32bit) + tcg_gen_ext32u_i64 (r, r); break; case 1: /* LSR */ -tcg_gen_shr_i64(r, cpu_reg(reg), tcg_shift); + /* For logical right shift we zero extend first, to zero + the upper bits. We don't need to extend on out. */ + if (is_32bit) { + tcg_gen_ext32u_i64 (r, cpu_reg(reg)); + tcg_gen_shr_i64 (r, r, tcg_shift); + } else + tcg_gen_shr_i64(r, cpu_reg(reg), tcg_shift); break; case 2: /* ASR */ + /* For arithmetic right shift we sign extend first, then shift, + and then need to clear the upper bits again. */ if (is_32bit) { TCGv_i64 tcg_tmp = tcg_temp_new_i64(); tcg_gen_ext32s_i64(tcg_tmp, cpu_reg(reg)); tcg_gen_sar_i64(r, tcg_tmp, tcg_shift); + tcg_gen_ext32u_i64 (r, r); tcg_temp_free_i64(tcg_tmp); } else { tcg_gen_sar_i64(r, cpu_reg(reg), tcg_shift); } break; -case 3: -tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift); +case 3: /* ROR */ + /* For rotation extending doesn't help, we really have to use + a 32bit rotate. 
*/ + if (is_32bit) { + TCGv_i32 tmp = tcg_temp_new_i32(); +tcg_gen_trunc_i64_i32(tmp, cpu_reg(reg)); + tcg_gen_rotr_i32(tmp, tmp, tcg_shift); +tcg_gen_extu_i32_i64(r, tmp); +tcg_temp_free_i32(tmp); + } else + tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift); break; } -- 1.8.1.4 From 33137f8a660750d7d8598c7e467f4ccc8dc5ef85 Mon Sep 17 00:00:00 2001 From: Michael Matz m...@suse.de Date: Sat, 23 Mar 2013 04:53:44 +0100 Subject: [PATCH] Fix the pstate flags helpers ADCS and SBCS/SUBS sometimes gave the wrong results for the C and V flags. This fixes it. --- target-arm/helper-a64.c | 52 - 1 file changed, 12 insertions(+), 40 deletions(-) diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c index 4375bf0..4fcb09b 100644 --- a/target-arm/helper-a64.c +++ b/target-arm/helper-a64.c @@ -7,8 +7,6 @@ uint32_t HELPER(pstate_add)(uint32_t pstate, uint64_t a1, uint64_t a2, uint64_t ar) { -int64_t s1 = a1; -int64_t s2 = a2; int64_t sr = ar; pstate = ~(PSTATE_N | PSTATE_Z | PSTATE_C | PSTATE_V); @@ -21,11 +19,15 @@
Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation
On 18 November 2013 13:12, Michael Matz m...@suse.de wrote: Hi, On Mon, 18 Nov 2013, Claudio Fontana wrote: +case 3: +tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift); +break; Incorrect rotate for 32bit? 32bit rotates and shifts were fixed in a patch later than the 60er series Alex posted. See attached. (Generally there are many fixes to emulated instructions in that branch) I think we're going to need to look through and fold in those fixes, otherwise we'll end up reduplicating that work in the course of code review :-( -- PMM
Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation
On 11/18/2013 02:15 PM, Peter Maydell wrote: On 18 November 2013 13:12, Michael Matz m...@suse.de wrote: Hi, On Mon, 18 Nov 2013, Claudio Fontana wrote: +case 3: +tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift); +break; Incorrect rotate for 32bit? 32bit rotates and shifts were fixed in a patch later than the 60er series Alex posted. See attached. (Generally there are many fixes to emulated instructions in that branch) I think we're going to need to look through and fold in those fixes, otherwise we'll end up reduplicating that work in the course of code review :-( -- PMM Thanks all. Regarding the access to registers in 32 bit mode, and the consequent write to registers in 32 bit mode, I am investigating how to do it a little bit more general, in the sense that generally when we access registers in 32bit mode we will (often) need to ignore the upper bits of the source register, and write zero to the destination register. Not always but often. This could be done once for all to reduce the chances of mistakes. C.
Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation
Btw, in the first patch: On 11/18/2013 02:12 PM, Michael Matz wrote: From df54486da31d6329696effa61096eda5ab85395a Mon Sep 17 00:00:00 2001 From: Michael Matz m...@suse.de Date: Sun, 24 Mar 2013 02:52:42 +0100 Subject: [PATCH] Fix 32bit rotates. The 32bit shifts generally weren't careful with the upper parts, either bits could leak in (for right shift) or leak or (for left shift). And rotate was completely off, rotating around bit 63, not 31. This fixes the CAST5 hash algorithm. --- target-arm/translate-a64.c | 30 +++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c index 96dc281..e3941a1 100644 --- a/target-arm/translate-a64.c +++ b/target-arm/translate-a64.c @@ -596,25 +596,49 @@ static TCGv_i64 get_shift(int reg, int shift_type, TCGv_i64 tcg_shift, r = tcg_temp_new_i64(); /* XXX carry_out */ +/* Careful with the width. We work on 64bit, but must make sure + that we zero-extend the result on out, and ignore any upper bits, + that might still be set in that register. */ switch (shift_type) { case 0: /* LSL */ + /* left shift is easy, simply zero-extend on out */ tcg_gen_shl_i64(r, cpu_reg(reg), tcg_shift); + if (is_32bit) + tcg_gen_ext32u_i64 (r, r); break; case 1: /* LSR */ -tcg_gen_shr_i64(r, cpu_reg(reg), tcg_shift); + /* For logical right shift we zero extend first, to zero +the upper bits. We don't need to extend on out. */ + if (is_32bit) { + tcg_gen_ext32u_i64 (r, cpu_reg(reg)); + tcg_gen_shr_i64 (r, r, tcg_shift); + } else + tcg_gen_shr_i64(r, cpu_reg(reg), tcg_shift); break; case 2: /* ASR */ + /* For arithmetic right shift we sign extend first, then shift, +and then need to clear the upper bits again. 
*/ if (is_32bit) { TCGv_i64 tcg_tmp = tcg_temp_new_i64(); tcg_gen_ext32s_i64(tcg_tmp, cpu_reg(reg)); tcg_gen_sar_i64(r, tcg_tmp, tcg_shift); + tcg_gen_ext32u_i64 (r, r); tcg_temp_free_i64(tcg_tmp); } else { tcg_gen_sar_i64(r, cpu_reg(reg), tcg_shift); } break; -case 3: -tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift); +case 3: /* ROR */ + /* For rotation extending doesn't help, we really have to use +a 32bit rotate. */ + if (is_32bit) { + TCGv_i32 tmp = tcg_temp_new_i32(); +tcg_gen_trunc_i64_i32(tmp, cpu_reg(reg)); + tcg_gen_rotr_i32(tmp, tmp, tcg_shift); Isn't this problematic? We are using gen_rotr_i32, but passing tcg_shift, which is a TCGv_i64. I remember I had compilation failures in the past when I tried something similar, so my understanding is that this can work with a certain compiler under certain compiler options, but is not guaranteed to work in all cases. I think we need to either explicitly convert the tcg_shift to a TCGv_i32, or we need to use an open coded version of the rotr_i64 that inserts at (32 - n) instead of (64 - n) What do you think? C. +tcg_gen_extu_i32_i64(r, tmp); +tcg_temp_free_i32(tmp); + } else + tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift); break; } -- 1.8.1.4
Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation
On 18 November 2013 13:43, Claudio Fontana claudio.font...@linaro.org wrote: We are using gen_rotr_i32, but passing tcg_shift, which is a TCGv_i64. I remember I had compilation failures in the past when I tried something similar, so my understanding is that this can work with a certain compiler under certain compiler options, but is not guaranteed to work in all cases. It's a debug option -- if you build with --enable-debug then TCGv_i32/TCGv_i64 mismatches should always cause compile failures. -- PMM
Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation
Hi, On Mon, 18 Nov 2013, Peter Maydell wrote: +case 3: +tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift); +break; Incorrect rotate for 32bit? 32bit rotates and shifts were fixed in a patch later than the 60er series Alex posted. See attached. (Generally there are many fixes to emulated instructions in that branch) I think we're going to need to look through and fold in those fixes, otherwise we'll end up reduplicating that work in the course of code review :-( Most probably. Authorship will be lost :-/ I was planning to submit all of these once the 60er set of Alex would be applied. If you're reworking that set more or less completely then it indeed makes more sense to fold in those things and so it'd probably be better to have them applied (i.e. basically have our full branch applied when dissecting things). The commits that fix things in the a64 decoder proper (not the linux-user implementation) would be: e14c1a5 softfloat: correctly handle overflow in float[32|64] to uint64 conversion cbc98b1 aarch64: Fix FCVTZU for single float a91f762 aarch64: Fix UZP/ZIP/TRN 644c748 aarch64: Fix 32bit TST 2a717e8 Fix FCVTAS and FCVTAU 0dd22d0 Fix decoding of floating-fixed conversions d52c999 Fix implementation of USHLL/SSHLL cfbb9e1 Fix using uninitialized value ecfdfcd Fix typo in FSUB detection 87fd8ca Increase MAX_OP_PER_INSTR 38452d8 Fix USHLL, and implement other SIMD shifts 4146d40 Fix INS element a62437c Fix fcmp(e) with NaNs ec2b8f3 softfloat: Fix float64_to_uint64 b003867 Fix EXTR for 32bit df54486 Fix 32bit rotates. 33137f8 Fix the pstate flags helpers 75cb838 Don't set flush to zero by default 564e811 Fix 128bit ldr (literal) 0ff91a0 Fix long immediate constants (That's all on top Alex' posted patchset but not git rebased onto it, top of Alex roughly corresponds to commit 40d66b61) Ciao, Michael.
Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation
On 18 November 2013 13:46, Michael Matz m...@suse.de wrote: On Mon, 18 Nov 2013, Peter Maydell wrote: I think we're going to need to look through and fold in those fixes, otherwise we'll end up reduplicating that work in the course of code review :-( Most probably. Authorship will be lost :-/ I was planning to submit all of these once the 60er set of Alex would be applied. If you're reworking that set more of less completely then it indeed makes more sense to fold in those things and so it'd probably be better to have them applied (i.e. basically have our full branch applied when dissecting things). The commits that fix things in the a64 decoder proper (not the linux-user implementation) would be: e14c1a5 softfloat: correctly handle overflow in float[32|64] to uint64 conversion cbc98b1 aarch64: Fix FCVTZU for single float a91f762 aarch64: Fix UZP/ZIP/TRN 644c748 aarch64: Fix 32bit TST 2a717e8 Fix FCVTAS and FCVTAU 0dd22d0 Fix decoding of floating-fixed conversions d52c999 Fix implementation of USHLL/SSHLL cfbb9e1 Fix using uninitialized value ecfdfcd Fix typo in FSUB detection 87fd8ca Increase MAX_OP_PER_INSTR 38452d8 Fix USHLL, and implement other SIMD shifts 4146d40 Fix INS element a62437c Fix fcmp(e) with NaNs ec2b8f3 softfloat: Fix float64_to_uint64 b003867 Fix EXTR for 32bit df54486 Fix 32bit rotates. 33137f8 Fix the pstate flags helpers 75cb838 Don't set flush to zero by default 564e811 Fix 128bit ldr (literal) 0ff91a0 Fix long immediate constants This looks like a small enough list to be tractable to fold in. My suggestion for authorship would be that we have the 'From' line indicate whoever wrote the bulk of the code and add sign-off lines from both of you. (Some of those, like the softfloat fixes, are probably standalone patches anyway.) thanks -- PMM
Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation
Hi, On Mon, 18 Nov 2013, Claudio Fontana wrote: +tcg_gen_trunc_i64_i32(tmp, cpu_reg(reg)); + tcg_gen_rotr_i32(tmp, tmp, tcg_shift); Isn't this problematic? We are using gen_rotr_i32, but passing tcg_shift, which is a TCGv_i64. With CONFIG_DEBUG_TCG it'll break, yes. Though in principle there's no canonical relation between the two argument types for shifts and rotates (unlike addition for example) TCG indeed wants to ensure that typeof(arg2)==typeof(arg1). I remember I had compilation failures in the past when I tried something similar, so my understanding is that this can work with a certain compiler under certain compiler options, but is not guaranteed to work in all cases. I think we need to either explicitly convert the tcg_shift to a TCGv_i32, or we need to use an open coded version of the rotr_i64 that inserts at (32 - n) instead of (64 - n) What do you think? I think converting tcg_shift might eventually lead to better generated code (if tcg is optimizing enough, now or in the future, haven't checked). Ciao, Michael.
Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation
On 11/18/2013 11:55 PM, Michael Matz wrote: I think we need to either explicitly convert the tcg_shift to a TCGv_i32, or we need to use an open coded version of the rotr_i64 that inserts at (32 - n) instead of (64 - n) What do you think? I think converting tcg_shift might eventually lead to better generated code (if tcg is optimizing enough, now or in the future, haven't checked). Agreed. r~
Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation
On 27.09.2013, at 11:25, Richard Henderson r...@twiddle.net wrote: On 09/26/2013 05:48 PM, Alexander Graf wrote: This patch adds emulation support for the orr instruction. Signed-off-by: Alexander Graf ag...@suse.de --- target-arm/helper-a64.c| 28 +++ target-arm/helper-a64.h| 1 + target-arm/translate-a64.c | 120 + 3 files changed, 149 insertions(+) diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c index 8105fb5..da72b7f 100644 --- a/target-arm/helper-a64.c +++ b/target-arm/helper-a64.c @@ -24,3 +24,31 @@ #include sysemu/sysemu.h #include qemu/bitops.h +uint32_t HELPER(pstate_add)(uint32_t pstate, uint64_t a1, uint64_t a2, +uint64_t ar) +{ +int64_t s1 = a1; +int64_t s2 = a2; +int64_t sr = ar; + +pstate = ~(PSTATE_N | PSTATE_Z | PSTATE_C | PSTATE_V); + +if (sr 0) { +pstate |= PSTATE_N; +} + +if (!ar) { +pstate |= PSTATE_Z; +} + +if (ar (ar a1)) { +pstate |= PSTATE_C; +} + +if ((s1 0 s2 0 sr 0) || +(s1 0 s2 0 sr 0)) { +pstate |= PSTATE_V; +} + +return pstate; +} Why are you not using the same split apart bits as A32? There is an architecturally defined register that specifies what pstate looks like and IIRC that includes system level state as well, similar to EFLAGS. So I figured it's more straight forward to use a single variable for it. I don't think it really makes much of a difference either way though. If we see that doing it in a split way makes more sense we can always just switch to that later. Alex
Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation
On 31 October 2013 00:29, Alexander Graf ag...@suse.de wrote: On 27.09.2013, at 11:25, Richard Henderson r...@twiddle.net wrote: Why are you not using the same split apart bits as A32? There is an architecturally defined register that specifies what pstate looks like and IIRC that includes system level state as well, similar to EFLAGS. No, the architecture goes out of its way to point out that pstate is not a register. There are a collection of different state bits which are generally accessible via different MSR/MRS instructions or in some cases not accessible at all. This is a difference from A32. In any case as Richard says we already split NZCV from the rest of CPSR in A32 -- the few places that want a complete 32 bit CPSR call a helper function that assembles it from the various separate parts. I don't think it really makes much of a difference either way though. If we see that doing it in a split way makes more sense we can always just switch to that later. I think the split is less critical for A64 because of the severely reduced conditionality which means we're less likely to be making frequent NZCV checks. However it's probably still worth having because it's pretty nearly free. -- PMM
Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation
On 09/26/2013 05:48 PM, Alexander Graf wrote: This patch adds emulation support for the orr instruction. Signed-off-by: Alexander Graf ag...@suse.de --- target-arm/helper-a64.c| 28 +++ target-arm/helper-a64.h| 1 + target-arm/translate-a64.c | 120 + 3 files changed, 149 insertions(+) diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c index 8105fb5..da72b7f 100644 --- a/target-arm/helper-a64.c +++ b/target-arm/helper-a64.c @@ -24,3 +24,31 @@ #include sysemu/sysemu.h #include qemu/bitops.h +uint32_t HELPER(pstate_add)(uint32_t pstate, uint64_t a1, uint64_t a2, +uint64_t ar) +{ +int64_t s1 = a1; +int64_t s2 = a2; +int64_t sr = ar; + +pstate = ~(PSTATE_N | PSTATE_Z | PSTATE_C | PSTATE_V); + +if (sr 0) { +pstate |= PSTATE_N; +} + +if (!ar) { +pstate |= PSTATE_Z; +} + +if (ar (ar a1)) { +pstate |= PSTATE_C; +} + +if ((s1 0 s2 0 sr 0) || +(s1 0 s2 0 sr 0)) { +pstate |= PSTATE_V; +} + +return pstate; +} Why are you not using the same split apart bits as A32? +/* XXX carry_out */ +switch (shift_type) { What carry out? I see no such in the ShiftReg description. +case 3: +tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift); +break; Incorrect rotate for 32bit? +static void handle_orr(DisasContext *s, uint32_t insn) +{ +int is_32bit = !get_bits(insn, 31, 1); +int dest = get_reg(insn); +int source = get_bits(insn, 5, 5); +int rm = get_bits(insn, 16, 5); +int shift_amount = get_sbits(insn, 10, 6); +int is_n = get_bits(insn, 21, 1); +int shift_type = get_bits(insn, 22, 2); +int opc = get_bits(insn, 29, 2); +bool setflags = (opc == 0x3); +TCGv_i64 tcg_op2; +TCGv_i64 tcg_dest; + +if (is_32bit (shift_amount 0)) { +/* reserved value */ +unallocated_encoding(s); +} Why are you extracting shift_amount signed? + +/* MOV is dest = xzr (source ~0) */ Comment is wrong. 
+if (!shift_amount source == 0x1f) { +if (is_32bit) { +tcg_gen_ext32u_i64(cpu_reg_sp(dest), cpu_reg(rm)); +} else { +tcg_gen_mov_i64(cpu_reg_sp(dest), cpu_reg(rm)); +} +if (is_n) { +tcg_gen_not_i64(cpu_reg_sp(dest), cpu_reg_sp(dest)); +} +if (is_32bit) { +tcg_gen_ext32u_i64(cpu_reg_sp(dest), cpu_reg_sp(dest)); +} These are incorrect -- no sp in the logical ops, but xzr instead. And surely we can emit fewer opcodes for the simple cases here. Since these are the canonical aliases for mov/mvn, it'll pay off. TCGv src = cpu_reg(rm); TCGv dst = cpu_reg(rd); if (is_n) { tcg_gen_not_i64(dst, src); src = dst; } if (is_32bit) { tcg_gen_ext32u_i64(dst, src); } else { tcg_gen_mov_i64(dst, src); } Note that tcg_gen_mov_i64 does the src == dst check, so a simple 64-bit mvn will only emit the not. +tcg_dest = cpu_reg(dest); +switch (opc) { +case 0x0: +case 0x3: +tcg_gen_and_i64(tcg_dest, cpu_reg(source), tcg_op2); +break; +case 0x1: +tcg_gen_or_i64(tcg_dest, cpu_reg(source), tcg_op2); +break; +case 0x2: +tcg_gen_xor_i64(tcg_dest, cpu_reg(source), tcg_op2); +break; +} + +if (is_32bit) { +tcg_gen_ext32u_i64(tcg_dest, tcg_dest); +} + +if (setflags) { +gen_helper_pstate_add(pstate, pstate, tcg_dest, cpu_reg(31), tcg_dest); +} Incorrect flags generated. They're different between add/sub and logical. In particular, C and V are always zero. +handle_orr(s, insn); And please use a more proper name than ORR for something that handles all of the logical insns. r~
[Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation
This patch adds emulation support for the orr instruction. Signed-off-by: Alexander Graf ag...@suse.de --- target-arm/helper-a64.c| 28 +++ target-arm/helper-a64.h| 1 + target-arm/translate-a64.c | 120 + 3 files changed, 149 insertions(+) diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c index 8105fb5..da72b7f 100644 --- a/target-arm/helper-a64.c +++ b/target-arm/helper-a64.c @@ -24,3 +24,31 @@ #include sysemu/sysemu.h #include qemu/bitops.h +uint32_t HELPER(pstate_add)(uint32_t pstate, uint64_t a1, uint64_t a2, +uint64_t ar) +{ +int64_t s1 = a1; +int64_t s2 = a2; +int64_t sr = ar; + +pstate = ~(PSTATE_N | PSTATE_Z | PSTATE_C | PSTATE_V); + +if (sr 0) { +pstate |= PSTATE_N; +} + +if (!ar) { +pstate |= PSTATE_Z; +} + +if (ar (ar a1)) { +pstate |= PSTATE_C; +} + +if ((s1 0 s2 0 sr 0) || +(s1 0 s2 0 sr 0)) { +pstate |= PSTATE_V; +} + +return pstate; +} diff --git a/target-arm/helper-a64.h b/target-arm/helper-a64.h index 30ecf78..1492b15 100644 --- a/target-arm/helper-a64.h +++ b/target-arm/helper-a64.h @@ -17,3 +17,4 @@ * License along with this library; if not, see http://www.gnu.org/licenses/. 
*/ +DEF_HELPER_FLAGS_4(pstate_add, TCG_CALL_NO_RWG_SE, i32, i32, i64, i64, i64) diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c index 039e73a..2a80715 100644 --- a/target-arm/translate-a64.c +++ b/target-arm/translate-a64.c @@ -484,6 +484,123 @@ static void handle_ldarx(DisasContext *s, uint32_t insn) tcg_temp_free_i64(tcg_addr); } +static TCGv_i64 get_shift(int reg, int shift_type, TCGv_i64 tcg_shift, + int is_32bit) +{ +TCGv_i64 r; + +r = tcg_temp_new_i64(); + +/* XXX carry_out */ +switch (shift_type) { +case 0: /* LSL */ +tcg_gen_shl_i64(r, cpu_reg(reg), tcg_shift); +break; +case 1: /* LSR */ +tcg_gen_shr_i64(r, cpu_reg(reg), tcg_shift); +break; +case 2: /* ASR */ +if (is_32bit) { +TCGv_i64 tcg_tmp = tcg_temp_new_i64(); +tcg_gen_ext32s_i64(tcg_tmp, cpu_reg(reg)); +tcg_gen_sar_i64(r, tcg_tmp, tcg_shift); +tcg_temp_free_i64(tcg_tmp); +} else { +tcg_gen_sar_i64(r, cpu_reg(reg), tcg_shift); +} +break; +case 3: +tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift); +break; +} + +return r; +} + +static TCGv_i64 get_shifti(int reg, int shift_type, int shift, int is_32bit) +{ +TCGv_i64 tcg_shift; +TCGv_i64 r; + +if (!shift) { +r = tcg_temp_new_i64(); +tcg_gen_mov_i64(r, cpu_reg(reg)); +return r; +} + +tcg_shift = tcg_const_i64(shift); +r = get_shift(reg, shift_type, tcg_shift, is_32bit); +tcg_temp_free_i64(tcg_shift); + +return r; +} + +static void handle_orr(DisasContext *s, uint32_t insn) +{ +int is_32bit = !get_bits(insn, 31, 1); +int dest = get_reg(insn); +int source = get_bits(insn, 5, 5); +int rm = get_bits(insn, 16, 5); +int shift_amount = get_sbits(insn, 10, 6); +int is_n = get_bits(insn, 21, 1); +int shift_type = get_bits(insn, 22, 2); +int opc = get_bits(insn, 29, 2); +bool setflags = (opc == 0x3); +TCGv_i64 tcg_op2; +TCGv_i64 tcg_dest; + +if (is_32bit (shift_amount 0)) { +/* reserved value */ +unallocated_encoding(s); +} + +/* MOV is dest = xzr (source ~0) */ +if (!shift_amount source == 0x1f) { +if (is_32bit) { 
+tcg_gen_ext32u_i64(cpu_reg_sp(dest), cpu_reg(rm)); +} else { +tcg_gen_mov_i64(cpu_reg_sp(dest), cpu_reg(rm)); +} +if (is_n) { +tcg_gen_not_i64(cpu_reg_sp(dest), cpu_reg_sp(dest)); +} +if (is_32bit) { +tcg_gen_ext32u_i64(cpu_reg_sp(dest), cpu_reg_sp(dest)); +} +return; +} + +tcg_op2 = get_shifti(rm, shift_type, shift_amount (is_32bit ? 31 : 63), + is_32bit); +if (is_n) { +tcg_gen_not_i64(tcg_op2, tcg_op2); +} + +tcg_dest = cpu_reg(dest); +switch (opc) { +case 0x0: +case 0x3: +tcg_gen_and_i64(tcg_dest, cpu_reg(source), tcg_op2); +break; +case 0x1: +tcg_gen_or_i64(tcg_dest, cpu_reg(source), tcg_op2); +break; +case 0x2: +tcg_gen_xor_i64(tcg_dest, cpu_reg(source), tcg_op2); +break; +} + +if (is_32bit) { +tcg_gen_ext32u_i64(tcg_dest, tcg_dest); +} + +if (setflags) { +gen_helper_pstate_add(pstate, pstate, tcg_dest, cpu_reg(31), tcg_dest); +} + +tcg_temp_free_i64(tcg_op2); +} + void disas_a64_insn(CPUARMState *env, DisasContext *s) { uint32_t insn; @@ -516,6 +633,9 @@ void disas_a64_insn(CPUARMState *env, DisasContext *s) handle_ldarx(s, insn);