Re: Disabling TLS address caching to help QEMU on GNU/Linux
Hello, On Thu, 22 Jul 2021, Richard Biener via Gcc wrote: > But how does TLS usage transfer between threads? On the gimple level > the TLS pointer is not visible and thus we'd happily CSE its address: Yes. All take-address operations then need to be encoded explicitely with a non-CSE-able internal function (or so): --> __ifn_get_tls_addr(); ( in the argument just so that it's clear that it doesn't access the value at x and to get the current effects of address-taken marking of ADDR_EXPR). (Or of course, ADDR_EXPR could be taken as unstable when applied to TLS decls). Quite a big hammer IMHO. Ciao, Michael.
[Qemu-devel] [PATCH v2] ppc: Fix size of ppc64 xer register
The normal gdb definition of the XER registers is only 32 bit, and that's what the current version of power64-core.xml also says (seems copied from gdb's). But qemu's idea of the XER register is target_ulong (in CPUPPCState, ppc_gdb_register_len and ppc_cpu_gdb_read_register) That mismatch leads to the following message when attaching with gdb: Truncated register 32 in remote 'g' packet (and following on that qemu stops responding). The simple fix is to say the truth in the .xml file. But the better fix is to actually make it 32bit on the wire, as old gdbs don't support XML files for describing registers. Also the XER state in qemu doesn't seem to use the high 32 bits, so sending it off to gdb doesn't seem worthwhile. [v2: fix formatting in moved line and adjust others] Signed-off-by: Michael Matz <m...@suse.de> --- target/ppc/gdbstub.c | 26 +- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/target/ppc/gdbstub.c b/target/ppc/gdbstub.c index 7a33813..688749d 100644 --- a/target/ppc/gdbstub.c +++ b/target/ppc/gdbstub.c @@ -33,14 +33,14 @@ static int ppc_gdb_register_len_apple(int n) return 8; case 64 ... 95: return 16; -case 64+32: /* nip */ -case 65+32: /* msr */ -case 67+32: /* lr */ -case 68+32: /* ctr */ -case 69+32: /* xer */ -case 70+32: /* fpscr */ +case 64 + 32: /* nip */ +case 65 + 32: /* msr */ +case 67 + 32: /* lr */ +case 68 + 32: /* ctr */ +case 70 + 32: /* fpscr */ return 8; -case 66+32: /* cr */ +case 66 + 32: /* cr */ +case 69 + 32: /* xer */ return 4; default: return 0; @@ -61,6 +61,8 @@ static int ppc_gdb_register_len(int n) return 8; case 66: /* cr */ +case 69: +/* xer */ return 4; case 64: /* nip */ @@ -70,8 +72,6 @@ static int ppc_gdb_register_len(int n) /* lr */ case 68: /* ctr */ -case 69: -/* xer */ return sizeof(target_ulong); case 70: /* fpscr */ @@ -152,7 +152,7 @@ int ppc_cpu_gdb_read_register(CPUState *cs, uint8_t *mem_buf, int n) gdb_get_regl(mem_buf, env->ctr); break; case 69: -gdb_get_regl(mem_buf, env->xer); +gdb_get_reg32(mem_buf, env->xer); break; case 70: gdb_get_reg32(mem_buf, env->fpscr); @@ -208,7 +208,7 @@ int ppc_cpu_gdb_read_register_apple(CPUState *cs, uint8_t *mem_buf, int n) gdb_get_reg64(mem_buf, env->ctr); break; case 69 + 32: -gdb_get_reg64(mem_buf, env->xer); +gdb_get_reg32(mem_buf, env->xer); break; case 70 + 32: gdb_get_reg64(mem_buf, env->fpscr); @@ -259,7 +259,7 @@ int ppc_cpu_gdb_write_register(CPUState *cs, uint8_t *mem_buf, int n) env->ctr = ldtul_p(mem_buf); break; case 69: -env->xer = ldtul_p(mem_buf); +env->xer = ldl_p(mem_buf); break; case 70: /* fpscr */ @@ -309,7 +309,7 @@ int ppc_cpu_gdb_write_register_apple(CPUState *cs, uint8_t *mem_buf, int n) env->ctr = ldq_p(mem_buf); break; case 69 + 32: -env->xer = ldq_p(mem_buf); +env->xer = ldl_p(mem_buf); break; case 70 + 32: /* fpscr */ -- 1.8.1.4
[Qemu-devel] [PATCH] ppc: Fix size of ppc64 xer register (fwd)
The normal gdb definition of the XER registers is only 32 bit, and that's what the current version of power64-core.xml also says (seems copied from gdb's). But qemu's idea of the XER register is target_ulong (in CPUPPCState, ppc_gdb_register_len and ppc_cpu_gdb_read_register) That mismatch leads to the following message when attaching with gdb: Truncated register 32 in remote 'g' packet (and following on that qemu stops responding). The simple fix is to say the truth in the .xml file. But the better fix is to actually make it 32bit on the wire, as old gdbs don't support XML files for describing registers. Also the XER state in qemu doesn't seem to use the high 32 bits, so sending it off to gdb doesn't seem worthwhile. Signed-off-by: Michael Matz <m...@suse.de> --- target/ppc/gdbstub.c | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/target/ppc/gdbstub.c b/target/ppc/gdbstub.c index 7a33813..b6f6693 100644 --- a/target/ppc/gdbstub.c +++ b/target/ppc/gdbstub.c @@ -37,10 +37,10 @@ static int ppc_gdb_register_len_apple(int n) case 65+32: /* msr */ case 67+32: /* lr */ case 68+32: /* ctr */ -case 69+32: /* xer */ case 70+32: /* fpscr */ return 8; case 66+32: /* cr */ +case 69+32: /* xer */ return 4; default: return 0; @@ -61,6 +61,8 @@ static int ppc_gdb_register_len(int n) return 8; case 66: /* cr */ +case 69: +/* xer */ return 4; case 64: /* nip */ @@ -70,8 +72,6 @@ static int ppc_gdb_register_len(int n) /* lr */ case 68: /* ctr */ -case 69: -/* xer */ return sizeof(target_ulong); case 70: /* fpscr */ @@ -152,7 +152,7 @@ int ppc_cpu_gdb_read_register(CPUState *cs, uint8_t *mem_buf, int n) gdb_get_regl(mem_buf, env->ctr); break; case 69: -gdb_get_regl(mem_buf, env->xer); +gdb_get_reg32(mem_buf, env->xer); break; case 70: gdb_get_reg32(mem_buf, env->fpscr); @@ -208,7 +208,7 @@ int ppc_cpu_gdb_read_register_apple(CPUState *cs, uint8_t *mem_buf, int n) gdb_get_reg64(mem_buf, env->ctr); break; case 69 + 32: -gdb_get_reg64(mem_buf, env->xer); +gdb_get_reg32(mem_buf, env->xer); break; case 70 + 32: gdb_get_reg64(mem_buf, env->fpscr); @@ -259,7 +259,7 @@ int ppc_cpu_gdb_write_register(CPUState *cs, uint8_t *mem_buf, int n) env->ctr = ldtul_p(mem_buf); break; case 69: -env->xer = ldtul_p(mem_buf); +env->xer = ldl_p(mem_buf); break; case 70: /* fpscr */ @@ -309,7 +309,7 @@ int ppc_cpu_gdb_write_register_apple(CPUState *cs, uint8_t *mem_buf, int n) env->ctr = ldq_p(mem_buf); break; case 69 + 32: -env->xer = ldq_p(mem_buf); +env->xer = ldl_p(mem_buf); break; case 70 + 32: /* fpscr */ -- 1.8.1.4
Re: [Qemu-devel] [PATCH 16/16] exec-all.h: Increase MAX_OP_PER_INSTR for ARM A64 decoder
Hi, On Sun, 9 Mar 2014, Peter Maydell wrote: diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c index 6a3597d..2435c95 100644 --- a/target-arm/translate-a64.c +++ b/target-arm/translate-a64.c @@ -9007,9 +9007,8 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn) case 0x19: /* FRINTM */ case 0x38: /* FRINTP */ case 0x39: /* FRINTZ */ -case 0x58: /* FRINTA */ need_rmode = true; -rmode = extract32(opcode, 5, 2) | (extract32(opcode, 0, 1) 1); +rmode = extract32(opcode, 5, 1) | (extract32(opcode, 0, 1) 1); /* fall through */ case 0x59: /* FRINTX */ case 0x79: /* FRINTI */ @@ -9019,6 +9018,15 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn) return; } break; +case 0x58: /* FRINTA */ +need_rmode = true; +rmode = FPROUNDING_TIEAWAY; +need_fpstatus = true; +if (size == 3 !is_q) { +unallocated_encoding(s); +return; +} +break; Merge the above into [15/16] ? Ciao, Michael.
Re: [Qemu-devel] Call for testing QEMU aarch64-linux-user emulation
Hi, On Mon, 10 Mar 2014, Alex Bennée wrote: Not sure there's much point looking very deeply into this. Java programs are threaded, threaded programs don't work under QEMU = don't try to run Java under QEMU :-) Having said that I'm sure there was another SIGILL related crash on Launchpad and I think we would be interested in those. Is JAVA really that buggy under QEMU just because of threading? Generally speaking, yes. I've never seen problems with openjdk (with the suse tree), so the segfault above might be also be related to the segfault handling for read-only data segments containing code (the signal trampoline on stack), for which the patches were recently proposed upstream. Ciao, Michael.
Re: [Qemu-devel] Call for testing QEMU aarch64-linux-user emulation
Hi, On Wed, 26 Feb 2014, Dann Frazier wrote: I've narrowed down the changes that seem to prevent both types of segfaults to the following changes that introduce a wrapper around sigprocmask: https://github.com/susematz/qemu/commit/f1542ae9fe10d5a241fc2624ecaef5f0948e3472 https://github.com/susematz/qemu/commit/4e5e1607758841c760cda4652b0ee7a6bc6eb79d https://github.com/susematz/qemu/commit/63eb8d3ea58f58d5857153b0c632def1bbd05781 I'm not sure if this is a real fix or just papering over my issue - It's fixing the issue, but strictly speaking introduces an QoI problem. SIGSEGV must not be controllable by the guest, it needs to be always deliverable to qemu; that is what's fixed. The QoI problem introduced is that with the implementation as is, the fiddling with SIGSEGV is detectable by the guest. E.g. if it installs a segv handler, blocks segv, then forces a segfault, checks that it didn't arrive, then unblocks segv and checks that it now arrives, such testcase would be able to detect that in fact it couldn't block SIGSEGV. Luckily for us, the effect of blocking SIGSEGV and then generating one in other ways than kill/sigqueue/raise (e.g. by writing to NULL) are undefined, so in practice that QoI issue doesn't matter. To fix also that latter part it'd need a further per-thread flag segv_blocked_p which needs to be checked before actually delivering a guest-directed SIGSEGV (in comparison to a qemu-directed SEGV), and otherwise requeue it. That's made a bit complicated when the SEGV was process-directed (not thread-directed) because in that case it needs to be delivered as long as there's _any_ thread which has it unblocked. So given the above undefinedness for sane uses of SEGVs it didn't seem worth the complication of having an undetectable virtualization of SIGSEGV. but either way, are these changes reasonable for upstream submission? IIRC the first two commits (from Alex Barcelo) were submitted once in the past, but fell through the cracks. Ciao, Michael.
Re: [Qemu-devel] Call for testing QEMU aarch64-linux-user emulation
Hi, On Tue, 25 Feb 2014, Andreas Färber wrote: There are some pretty large differences between these trees with respect to signal syscalls - is that the likely culprit? Quite likely. We explicitly concentrated on the arch64 specific instruction emulation leaving more generic patches to flow in from SUSE as they matured. I guess it's time to go through the remaining patches and see what's up-streamable. Alex/Michael, Are any of these patches in flight now? I don't think so, Alex seems to hate cleaning that stuff up... :P Compare https://github.com/openSUSE/qemu/commits/opensuse-1.7 for our general queue. We have patches adding locking to TCG, and there's a hack pinning the CPU somewhere. The locking and pinning is all wrong (resp. overbroad). The aarch64-1.6 branch contains better implementations for that and some actual fixes for aarch64' userspace. Somehow I don't find the time to go through our patches in linux-user and submit them. The biggest road-block is that signal vs syscall handling is fundamentally broken in linux-user and it's unfixable without assembler implementations of the syscall caller. That is also still broken on the suse branch where I tried various ways to fix that until coming to that conclusion. Ciao, Michael.
Re: [Qemu-devel] Call for testing QEMU aarch64-linux-user emulation
Hi, On Tue, 25 Feb 2014, Peter Maydell wrote: On 25 February 2014 13:33, Michael Matz m...@suse.de wrote The biggest road-block is that signal vs syscall handling is fundamentally broken in linux-user and it's unfixable without assembler implementations of the syscall caller. I'm not entirely sure it's possible to fix even with hand-rolled assembly, to be honest. I am fairly sure. The problem is simply to detect if the signal arrived while inside the kernel (doing the syscalls job) or still or already outside. This structure helps with that: before: setup args and stuff for syscall to do atsys: syscall insn (single insn!) after: mov return, return-register-per-psABI realafter: rest of stuff When a signal arrives you look at the return address the kernel puts into the siginfo. Several cases: * before = retaddr atsys: syscall hasn't yet started, so break syscall sequence, handle signal in main loop, redo the syscall. * atsys == retaddr syscall has started and the kernel wants to restart it after sighandler returns, _or_ syscall was just about to be started. No matter what, the right thing to do is to actually do the syscall (again) after handling the signal. So break syscall sequence, handle signal in main loop, (re)do the syscall. * after = retaddr realafter: syscall is complete but return value not yet in some variable but still in register (or other post-syscall work that still needs doing isn't complete yet); nothing interesting to do, just let it continue with the syscall sequence, handle signal in main loop after that one returned. * retaddr any other value: uninteresting; actually I'm not sure we'd need the distinction between after and realafter. Handle signal as usual in main loop. The important thing for qemu is to know precisely if the signal arrived before the syscall was started (or is to be restarted), or after it returned, and for that the compiler must not be allowed to insert any code between atsys and after. However there are a bunch of bugfixes in your tree which it would be really nice to see upstreamed: the sendmmsg patch, for instance. We can at least get the aarch64 support to the same level as the 32 bit arm linux-user setup, which is genuinely useful to people despite the well known races and locking issues. Yeah. Ciao, Michael.
Re: [Qemu-devel] [PATCH v3 0/5] disas: add libvixl to support A64 disassembly
Hi, On Wed, 5 Feb 2014, Peter Maydell wrote: Changes v2 - v3: * added a README clarifying that libvixl's disassembly support is not complete and that contributions should go to libvixl upstream first for preference Hmm, why aren't we simply using the binutils disassembler? It's also (C) by ARM, so there shouldn't be any relicensing problems. And it does support AdvSIMD and system instructions. Ciao, Michael.
Re: [Qemu-devel] [PATCH v3 0/5] disas: add libvixl to support A64 disassembly
Hi, On Thu, 6 Feb 2014, Peter Maydell wrote: On 6 February 2014 13:45, Michael Matz m...@suse.de wrote: Hmm, why aren't we simply using the binutils disassembler? It's also (C) by ARM, so there shouldn't be any relicensing problems. And it does support AdvSIMD and system instructions. It is GPLv3 which is not compatible with GPLv2 which QEMU requires. The contribution process for binutils involves a copyright assignment which means the FSF now have the copyright there, as I understand it. The FSF always grants back rights on the contribution to the contributor. ARM could simply double-license their original contribution of the disassembler. Ciao, Michael.
Re: [Qemu-devel] [PATCH v3 0/5] disas: add libvixl to support A64 disassembly
Hi, On Thu, 6 Feb 2014, Christopher Covington wrote: The FSF always grants back rights on the contribution to the contributor. ARM could simply double-license their original contribution of the disassembler. Is dual licensing always possible given a grant-back? What if the contribution is a derivative of a GPL-3.0 licensed work? Then it's not. I don't think that's the case with ARMs aarch64 disasm, but ultimately only ARM can say, and Peter already hinted that this all isn't possible. Which is a pity, duplicate work, inferior capability (at least for now), and such ... Ciao, Michael.
Re: [Qemu-devel] [PATCH 26/60] AArch64: Add ADR instruction emulation
Hi, On Tue, 19 Nov 2013, Claudio Fontana wrote: +uint64_t imm; +uint64_t base; + +imm = get_sbits(insn, 5, 19) 2; +imm |= get_bits(insn, 29, 2); does this work with negative values? Yes. get_sbits returns a sign extended (32bit) int, the shift doesn't change that, the conversion to uint64_t first sign extends to 64bit and then converts to unsigned (conceptually). From then on it's an unsigned value (with high bits set when input was negative), but two complement arithmetic makes that irrelevant. Ciao, Michael.
Re: [Qemu-devel] [PATCH 50/60] AArch64: Add Floating-point-fixed-point
Hi, On Tue, 19 Nov 2013, Janne Grunau wrote: +static void handle_fpfpconv(DisasContext *s, uint32_t insn) +{ +int opcode = get_bits(insn, 16, 3); +int rmode = get_bits(insn, 20, 2); rmode is at 19 +case 0x1: /* [S|U]CVTF (scalar-float) */ and it's case 0x0: for [S|U]CVTF Both were fixed after Alex' series with 0dd22d0c: https://github.com/susematz/qemu/commit/0dd22d0c5cd1dcdccd5df953f1981d461d3054e5 Ciao, Michael.
Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation
Hi, On Mon, 18 Nov 2013, Claudio Fontana wrote: +case 3: +tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift); +break; Incorrect rotate for 32bit? 32bit rotates and shifts were fixed in a patch later than the 60er series Alex posted. See attached. (Generally there are many fixes to emulated instructions in that branch) +if (!shift_amount source == 0x1f) { Besides the comment, is this correct? No, it needs to check for opc == 1. +tcg_dest = cpu_reg(dest); +switch (opc) { +case 0x0: +case 0x3: +tcg_gen_and_i64(tcg_dest, cpu_reg(source), tcg_op2); +break; +case 0x1: +tcg_gen_or_i64(tcg_dest, cpu_reg(source), tcg_op2); +break; +case 0x2: +tcg_gen_xor_i64(tcg_dest, cpu_reg(source), tcg_op2); +break; +} + +if (is_32bit) { +tcg_gen_ext32u_i64(tcg_dest, tcg_dest); +} + +if (setflags) { +gen_helper_pstate_add(pstate, pstate, tcg_dest, cpu_reg(31), tcg_dest); +} Incorrect flags generated. They're different between add/sub and logical. In particular, C and V are always zero. That's done correctly with the fixed pstate helpers coming with a later patch (see attached as well). reg31 is zero, so that's flags as if for dest == dest + 0, and PSTATE_C and PSTATE_V will be zero. That is, the logical flags are the same as the arithmetic flags for result plus zero with no carry_in. Ciao, Michael.From df54486da31d6329696effa61096eda5ab85395a Mon Sep 17 00:00:00 2001 From: Michael Matz m...@suse.de Date: Sun, 24 Mar 2013 02:52:42 +0100 Subject: [PATCH] Fix 32bit rotates. The 32bit shifts generally weren't careful with the upper parts, either bits could leak in (for right shift) or leak or (for left shift). And rotate was completely off, rotating around bit 63, not 31. This fixes the CAST5 hash algorithm. --- target-arm/translate-a64.c | 30 +++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c index 96dc281..e3941a1 100644 --- a/target-arm/translate-a64.c +++ b/target-arm/translate-a64.c @@ -596,25 +596,49 @@ static TCGv_i64 get_shift(int reg, int shift_type, TCGv_i64 tcg_shift, r = tcg_temp_new_i64(); /* XXX carry_out */ +/* Careful with the width. We work on 64bit, but must make sure + that we zero-extend the result on out, and ignore any upper bits, + that might still be set in that register. */ switch (shift_type) { case 0: /* LSL */ + /* left shift is easy, simply zero-extend on out */ tcg_gen_shl_i64(r, cpu_reg(reg), tcg_shift); + if (is_32bit) + tcg_gen_ext32u_i64 (r, r); break; case 1: /* LSR */ -tcg_gen_shr_i64(r, cpu_reg(reg), tcg_shift); + /* For logical right shift we zero extend first, to zero + the upper bits. We don't need to extend on out. */ + if (is_32bit) { + tcg_gen_ext32u_i64 (r, cpu_reg(reg)); + tcg_gen_shr_i64 (r, r, tcg_shift); + } else + tcg_gen_shr_i64(r, cpu_reg(reg), tcg_shift); break; case 2: /* ASR */ + /* For arithmetic right shift we sign extend first, then shift, + and then need to clear the upper bits again. */ if (is_32bit) { TCGv_i64 tcg_tmp = tcg_temp_new_i64(); tcg_gen_ext32s_i64(tcg_tmp, cpu_reg(reg)); tcg_gen_sar_i64(r, tcg_tmp, tcg_shift); + tcg_gen_ext32u_i64 (r, r); tcg_temp_free_i64(tcg_tmp); } else { tcg_gen_sar_i64(r, cpu_reg(reg), tcg_shift); } break; -case 3: -tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift); +case 3: /* ROR */ + /* For rotation extending doesn't help, we really have to use + a 32bit rotate. */ + if (is_32bit) { + TCGv_i32 tmp = tcg_temp_new_i32(); +tcg_gen_trunc_i64_i32(tmp, cpu_reg(reg)); + tcg_gen_rotr_i32(tmp, tmp, tcg_shift); +tcg_gen_extu_i32_i64(r, tmp); +tcg_temp_free_i32(tmp); + } else + tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift); break; } -- 1.8.1.4 From 33137f8a660750d7d8598c7e467f4ccc8dc5ef85 Mon Sep 17 00:00:00 2001 From: Michael Matz m...@suse.de Date: Sat, 23 Mar 2013 04:53:44 +0100 Subject: [PATCH] Fix the pstate flags helpers ADCS and SBCS/SUBS sometimes gave the wrong results for the C and V flags. This fixes it. --- target-arm/helper-a64.c | 52 - 1 file changed, 12 insertions(+), 40 deletions(-) diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c index 4375bf0..4fcb09b 100644 --- a/target-arm/helper-a64.c +++ b/target-arm/helper-a64.c @@ -7,8 +7,6 @@ uint32_t HELPER(pstate_add)(uint32_t pstate, uint64_t a1, uint64_t a2, uint64_t ar) { -int64_t s1 = a1; -int64_t s2 = a2; int64_t sr = ar; pstate = ~(PSTATE_N | PSTATE_Z | PSTATE_C | PSTATE_V); @@ -21,11 +19,15
Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation
Hi, On Mon, 18 Nov 2013, Peter Maydell wrote: +case 3: +tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift); +break; Incorrect rotate for 32bit? 32bit rotates and shifts were fixed in a patch later than the 60er series Alex posted. See attached. (Generally there are many fixes to emulated instructions in that branch) I think we're going to need to look through and fold in those fixes, otherwise we'll end up reduplicating that work in the course of code review :-( Most probably. Authorship will be lost :-/ I was planning to submit all of these once the 60er set of Alex would be applied. If you're reworking that set more of less completely then it indeed makes more sense to fold in those things and so it'd probably be better to have them applied (i.e. basically have our full branch applied when dissecting things). The commits that fix things in the a64 decoder proper (not the linux-user implementation) would be: e14c1a5 softfloat: correctly handle overflow in float[32|64] to uint64 conversion cbc98b1 aarch64: Fix FCVTZU for single float a91f762 aarch64: Fix UZP/ZIP/TRN 644c748 aarch64: Fix 32bit TST 2a717e8 Fix FCVTAS and FCVTAU 0dd22d0 Fix decoding of floating-fixed conversions d52c999 Fix implementation of USHLL/SSHLL cfbb9e1 Fix using uninitialized value ecfdfcd Fix typo in FSUB detection 87fd8ca Increase MAX_OP_PER_INSTR 38452d8 Fix USHLL, and implement other SIMD shifts 4146d40 Fix INS element a62437c Fix fcmp(e) with NaNs ec2b8f3 softfloat: Fix float64_to_uint64 b003867 Fix EXTR for 32bit df54486 Fix 32bit rotates. 33137f8 Fix the pstate flags helpers 75cb838 Don't set flush to zero by default 564e811 Fix 128bit ldr (literal) 0ff91a0 Fix long immediate constants (That's all on top Alex' posted patchset but not git rebased onto it, top of Alex roughly corresponds to commit 40d66b61) Ciao, Michael.
Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation
Hi, On Mon, 18 Nov 2013, Claudio Fontana wrote: +tcg_gen_trunc_i64_i32(tmp, cpu_reg(reg)); + tcg_gen_rotr_i32(tmp, tmp, tcg_shift); Isn't this problematic? We are using gen_rotr_i32, but passing tcg_shift, which is a TCGv_i64. With CONFIG_DEBUG_TCG it'll break, yes. Though in principle there's no canonical relation between the two argument types for shifts and rotates (unlike addition for example) TCG indeed wants to ensure that typeof(arg2)==typeof(arg1). I remember I had compilation failures in the past when I tried something similar, so my understanding is that this can work with a certain compiler under certain compiler options, but is not guaranteed to work in all cases. I think we need to either explicitly convert the tcg_shift to a TCGv_i32, or we need to use an open coded version of the rotr_i64 that inserts at (32 - n) instead of (64 - n) What do you think? I think converting tcg_shift might eventually lead to better generated code (if tcg is optmizing enough, now or in the future, haven't checked). Ciao, Michael.
Re: [Qemu-devel] Checking the state of arm64-linux-user
Hi, On Tue, 29 Oct 2013, Peter Maydell wrote: On 29 October 2013 18:55, Alex Bennée alex.ben...@linaro.org wrote: I think the problem is arm64 has been posted in several dependant patch sets hence working from a git tree. I think for now I'll take off the -Werror training wheels and see how far it gets. http://lists.nongnu.org/archive/html/qemu-devel/2013-09/msg04503.html is Alex's patchset which he rebased on to a qemu master, so if you can't wait for v2 then that's the one to look at. Alex's patchset covers only the first part of the whole thing, and so misses some instructions and fixes. If you merely want to use qemu-arm64 you're better off using the tree from me. Alex B.: I'm building that tree with gcc 4.7 and this config: % ../configure --disable-system --enable-linux-user --disable-werror \ --static --disable-linux-aio --disable-strip \ --target-list=arm64-linux-user % make Ciao, Michael.
[Qemu-devel] Re: CPU type qemu64 breaks guest code
Hi, On Mon, 21 Mar 2011, Andre Przywara wrote: .name = qemu64, .level = 4, .vendor1 = CPUID_VENDOR_AMD_1, .vendor2 = CPUID_VENDOR_AMD_2, .vendor3 = CPUID_VENDOR_AMD_3, .family = 6, .model = 2, .stepping = 3, ... Well, yes, this is strange, and a different CPU model mimicking some really existing CPU would be better. But in my experience this opens up a can of worms, since a different family will trigger a lot of other nasty code that was silent before. Mimicking as a really existing CPU would trigger exactly those code paths that are triggered when that same code is running on real hardware. If such hypothetical problems were real they would occur for non-emulated hardware already. But they don't. else if (strcmp (vendor_string, AuthenticAMD) == 0) switch (family) { case 5: case 6: abort (); The reasoning behind it is that for AMD, all 64-bit CPUs were family 15. I guess that should be fixed there, as it is obviously nonsense. Not really. The above is for the x86_64 code paths, i.e. 64bit code. There never were, and there never will be, AMD processors capable of running 64bit code that have family 5 or 6. The abort can't ever trigger on hardware or correctly emulated hardware. I haven't check what they actually need that family 6 does not provide, if it is long mode, then this bit should be checked. It's not about need. It's about tuning and expectations. gmp wants to tune itself according to the hardware it runs on, hence it switches on the vendor and family/model. And in order not to have to handle combinations that can't exist in the real world (which seems sane to me) the aborts where put in place. Now there's something to be said about being lenient in what you accept, but it's not wrong to be strict. I found that there is no valid combination for both Intel and AMD. Of course not. Why should there? The FMS combination obviously can't exist independend of the (claimed) vendor. Trying to go for one FMS that fits all is going to fail, how could it be different? Ciao, Michael.
Re: [Qemu-devel] [PATCH 02/40] elf: Add notes implementation
Hi, On Mon, 1 Nov 2010, Alexander Graf wrote: No. Pointers of same type which are not void pointers can be compared. There is even a data type ptrdiff_t, so you can also compare their difference with zero. Let's ask someone who definitely knows :). Michael, is code like char *x = a, *y = b; if (x y) { ... } Pointers can be compared iff they point into the same object (including right after the object), so it depends on what a and b were above. This would be invalid for instance: int o1, o2; int *p1 = o1, *p2 = o2; if (p1 p2) ... valid? Or do I first have to cast x and y to unsigned longs or uintptr_t? For doing a valid pointer comparison you don't have to cast anything. Casting doesn't make an invalid comparison valid. Ciao, Michael.
Re: [Qemu-devel] Re: [PATCH 1/5] Fix i386 Host
Hi, On Fri, 18 Jan 2008, Johannes Schindelin wrote: Well, I can tell you why, but it doesn't help you: the 3.4.2 compiler has different deficiencies in reload than the 4.x line of compilers. To make the whole thingy work on all compilers trying and testing is required to avoid all these different deficiencies. My patch is partly real bug fixes (the q constraint part for instance) and partly changes helping to lessen the register pressure which reload needs to fix (the %ecx thingy for instance). Both of these might expose the bugs in 3.4.2, in which case we need to hack around those as well. That's what I tried to do with the patch from http://article.gmane.org/gmane.comp.emulators.qemu/22762 , but it needs of course testing by someone who actually uses 3.4.2. I just downloaded it, and tested it on MinGW with 3.4.2, and I still get the can't find register in class `Q_REGS´ while reloading `asm´ error. Bummer. As a test of theory (I'm not proposing this as patch), can you see what happens if you make vtmp volatile, i.e. make it look like so ? : #if DATA_SIZE == 1 || DATA_SIZE == 2 volatile RES_TYPE vtmp = v; #endif Bah, remote work-arounding of reload sucks :-) Ciao, Michael.
Re: [Qemu-devel] Re: [PATCH 1/5] Fix i386 Host
Hi, On Fri, 18 Jan 2008, Johannes Schindelin wrote: asm ( ... movzbl %b1, %%edx\n ... : : r (blubb), r (bla) ); Okay, but this only concerns gcc4, apparently. No, it's nothing to do with GCC. But apparently it has! With gcc 4 I did never get the error. As I tried to explain, this is pure luck. Which probably means that gcc 4 did _not_ use ecx, and therefore it does not have to be pushed and popped. We are talking about the hunk using the q constraint for operand 1 in st[bw]_kernel. The change in the clobber list (and the associated saving/restoring of %ecx around the call) is something entirely different. Which -- judging from how commonly glue() is called in op.c -- could mean a performance hit. glue() is a macro, the function called is stw_kernel (inline function). I am all for supporting gcc 3, but please, please not at the cost of having a performance hit for _existing_ users. Have you measured this? This function actually does a call to stw_mmu, a rather slow and big function, the overhead of one register store more or less is probably zero. But that point is mood anyway. When it works without the q constraint in gcc 3.4.2 it only does so, because GCC allocates one of the ax-dx registers to that operand (by luck, not by design). As T1 is coming in in esi there anyway existed a reg-reg move already, so you pay that performance hit (if you like to call it such) already. Only if you want to trust your luck. I fear I don't have gcc 3.4.2 lying around anywhere, so I can't really help debugging this reload breakage in that GCC version. It might help to introduce a temporary to guide GCC through this problematic reload case by detaching the global register variable from the asm operand. For cases where it's no problem this should be optimized away, so doesn't inhibit a performance cost. What I mean is something like the below. If someone with gcc 3.4.2 could test that ... I do ask myself how gcc would optimise away instructions that are explicitely written in the asm() statement. If it does so, I consider this a serious bug in gcc. My patch in the last mail introduces a copy in C (to vtmp), _that_ can be optimized away under the right circumstances. Of course GCC does not change the asm template in any way. Ciao, Michael.
Re: [Qemu-devel] Re: [PATCH 1/5] Fix i386 Host
Hi, On Fri, 18 Jan 2008, Johannes Schindelin wrote: But apparently it has! With gcc 4 I did never get the error. As I tried to explain, this is pure luck. Maybe. Which probably means that gcc 4 did _not_ use ecx, and therefore it does not have to be pushed and popped. We are talking about the hunk using the q constraint for operand 1 in st[bw]_kernel. The change in the clobber list (and the associated saving/restoring of %ecx around the call) is something entirely different. It cannot be, because just changing the clobber list makes the code compile again! But I'm not talking about the clobber list at all. I reacted to the first mail forwarded to me, which was a question specifically about the hunk adding the q constraint, whose purpose I explained. Are you now also asking about the changes regarding %ecx and the clobber list? Ciao, Michael.
Re: [Qemu-devel] Re: [PATCH 1/5] Fix i386 Host
Hi, On Fri, 18 Jan 2008, Johannes Schindelin wrote: But I'm not talking about the clobber list at all. I reacted to the first mail forwarded to me, which was a question specifically about the hunk adding the q constraint, whose purpose I explained. Are you now also asking about the changes regarding %ecx and the clobber list? Okay, look, I do not have enough time to fix it myself. But there _is_ a breakage with gcc 3.4.2 on MinGW. If that is not fixed, I will openly oppose that patch going into CVS. That's fine and is why I sent a test patch which might fix the breakage when applied on top of the constraint patch. All I tried was to get an understanding why the current patch Fix i386 (which is still misnamed) broke on MinGW. Well, I can tell you why, but it doesn't help you: the 3.4.2 compiler has different deficiencies in reload than the 4.x line of compilers. To make the whole thingy work on all compilers trying and testing is required to avoid all these different deficiencies. My patch is partly real bug fixes (the q constraint part for instance) and partly changes helping to lessen the register pressure which reload needs to fix (the %ecx thingy for instance). Both of these might expose the bugs in 3.4.2, in which case we need to hack around those as well. That's what I tried to do with the patch from http://article.gmane.org/gmane.comp.emulators.qemu/22762 , but it needs of course testing by someone who actually uses 3.4.2. Ciao, Michael.
Re: [Qemu-devel] Re: [PATCH 1/5] Fix i386 Host
Hi, On Fri, 18 Jan 2008, Johannes Schindelin wrote: +#if DATA_SIZE == 1 || DATA_SIZE == 2 + q (v), +#else r (v), +#endif i ((CPU_TLB_SIZE - 1) CPU_TLB_ENTRY_BITS), i (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS), i (TARGET_PAGE_MASK | (DATA_SIZE - 1)), -- snap -- Michael, Alexander, what is this hunk supposed to do? This is required to generate valid assembler code. Without that hunk, the interesting parts of the asm look like so (for DATA_SIZE == 1): asm ( ... movzbl %b1, %%edx\n ... : : r (blubb), r (bla) ); Okay, but this only concerns gcc4, apparently. No, it's nothing to do with GCC. The instruction itself (movzbl) requires an 8-bit register, so it must be made sure by the constraints the that operand indeed is one of those four. If it also works with r then this is just pure luck (in that GCC chooses one of the four good registes, and not one of the three bad ones allowable with r). Can't we guard it with yet another defined(GCC...)? Only if you want to trust your luck. I fear I don't have gcc 3.4.2 lying around anywhere, so I can't really help debugging this reload breakage in that GCC version. It might help to introduce a temporary to guide GCC through this problematic reload case by detaching the global register variable from the asm operand. For cases where it's no problem this should be optimized away, so doesn't inhibit a performance cost. What I mean is something like the below. If someone with gcc 3.4.2 could test that ... Ciao, Michael. -- --- softmmu_header.h.mm 2008-01-18 14:15:46.0 +0100 +++ softmmu_header.h2008-01-18 14:14:49.0 +0100 @@ -212,6 +212,9 @@ static inline int glue(glue(lds, SUFFIX) static inline void glue(glue(st, SUFFIX), MEMSUFFIX)(target_ulong ptr, RES_TYPE v) { +#if DATA_SIZE == 1 || DATA_SIZE == 2 +RES_TYPE vtmp = v; +#endif asm volatile (movl %0, %%edx\n movl %0, %%eax\n shrl %3, %%edx\n @@ -253,7 +256,7 @@ static inline void glue(glue(st, SUFFIX) /* NOTE: 'q' would be needed as constraint, but we could not use it with T1 ! */ #if DATA_SIZE == 1 || DATA_SIZE == 2 - q (v), + q (vtmp), #else r (v), #endif
Re: [Qemu-devel] [patch] make qemu work with GCC 4
Hi, On Thu, 30 Aug 2007, Thiemo Seufer wrote: * ppc: adds -fno-section-anchors to OP_CFLAGS, as dyngen isn't prepared to deal with the relocs resulting from using section anchors Maybe this should be handled more generally then, not ppc specific, like other offending compiler options: check if the compiler knows the option, if yes, disable the feature. Yeah, like is done for the other additions to OP_CFLAGS, which are guarded by $(call cc-option, ...). I solved that by placing one of the T[012] operands into memory for HOST_I386, thereby freeing one reg. Here's some justification of why that doesn't really cost performance: with three free regs GCC is already spilling like mad in the snippets, we just trade one of those memory accesses (to stack) with one other mem access to the cpu_state structure, which will be in cache. Could you back up this assumption with some numbers? :-) I did that downthread. To me the slowdown is not significant enough, although it exists. --- qemu-0.9.0.cvs.orig/target-arm/cpu.h2007-06-24 14:09:48.0 +0200 +++ qemu-0.9.0.cvs/target-arm/cpu.h 2007-08-21 21:38:36.0 +0200 @@ -52,6 +52,9 @@ typedef uint32_t ARMReadCPFunc(void *opa */ typedef struct CPUARMState { +#if defined(HOST_I386) +uint32_t t1; +#endif /* Regs for current mode. */ uint32_t regs[16]; /* Frequently accessed CPSR bits are stored separately for efficiently. diff -urp qemu-0.9.0.cvs.orig/target-arm/exec.h qemu-0.9.0.cvs/target-arm/exec.h --- qemu-0.9.0.cvs.orig/target-arm/exec.h 2007-06-03 19:44:36.0 +0200 +++ qemu-0.9.0.cvs/target-arm/exec.h2007-08-21 21:48:48.0 +0200 @@ -23,7 +23,12 @@ register struct CPUARMState *env asm(AREG0); register uint32_t T0 asm(AREG1); register uint32_t T1 asm(AREG2); +#ifndef HOST_I386 register uint32_t T2 asm(AREG3); +#else +#define T2 (env-t1) +#endif T2/t1 mismatch, it seems. Likewise for mips and ppc. Yes, well, I could have named the env member perhaps temp :-) I can change it, though. Ciao, Michael.
Re: [Qemu-devel] [patch] make qemu work with GCC 4
Hi, On Wed, 29 Aug 2007, Michael Matz wrote: I solved that by placing one of the T[012] operands into memory for HOST_I386, thereby freeing one reg. Here's some justification of why that doesn't really cost performance: with three free regs GCC is already spilling like mad in the snippets, we just trade one of those memory accesses (to stack) with one other mem access to the cpu_state structure, which will be in cache. Do you have any evidence to support this claim? Not really, only an apple and orange comparison. A 1 iteration tests/sha1 run in the same Linux image, with -no-kqemu, on host and target i386: time ./sha1 with qemu-0.8.2 (compiled by gcc 3.3-hammer): 7.92 seconds with qemu-0.9.0-cvs (gcc4.1 compiled, with the patch): 8.15 seconds I'll try to get a better comparison. So, I've now compared our 0.9.0 package, once without patch compiled by 3.3-hammer, and once with patch and compiled by gcc 4.2: gcc33 compiled: 7.81 seconds (i.e. a bit faster than 0.8.2 was) gcc42 compiled: 8.07 seconds I.e. 3% slower. Ciao, Michael.
Re: [Qemu-devel] [patch] make qemu work with GCC 4
Hi, On Wed, 29 Aug 2007, Johannes Schindelin wrote: The whole patch is against a 0.9.0-cvs version from 2007-07-09 (Alex might know the exact checkout date), so chances are that it still applies :) It is based on the z80 fork, but it applies relatively cleanly (one trailing whitespace) to the version as of Use unsigned 32-bit load for ld/lduw. However, I still get this error: ../dyngen -o op.h op.o dyngen: ret or jmp expected at the end of op_tadd_T1_T0_ccTV make[1]: *** [op.h] Fehler 1 make[1]: Leaving directory `/home/me/qemu/sparc-linux-user' Using SuSE 10.2, i.e. gcc 4.1.2? For qemu CVS (see my other mail) I didn't test linux-user at all. In our qemu package itself (which builds also the linux-user parts) there are some more patches which might fix the above problem, don't know yet. I would try --disable-linux-user in your case. I can look at it later. When only making i386-softmmu, I still get this (on SuSE 10.2): In file included from /home/gene099/my/qemu/usb-linux.c:29: /usr/include/linux/usbdevice_fs.h:49: error: expected ‘:’, ‘,’, ‘;’, ‘}’ or ‘__attribute__’ before ‘*’ token /usr/include/linux/usbdevice_fs.h:56: error: expected ‘:’, ‘,’, ‘;’, ‘}’ or ‘__attribute__’ before ‘*’ token /usr/include/linux/usbdevice_fs.h:66: error: expected ‘:’, ‘,’, ‘;’, ‘}’ or ‘__attribute__’ before ‘*’ token /usr/include/linux/usbdevice_fs.h:100: error: expected ‘:’, ‘,’, ‘;’, ‘}’ or ‘__attribute__’ before ‘*’ token /usr/include/linux/usbdevice_fs.h:116: error: expected ‘:’, ‘,’, ‘;’, ‘}’ or ‘__attribute__’ before ‘*’ token /home/me/qemu/usb-linux.c: In function ‘usb_host_handle_data’: /home/me/qemu/usb-linux.c:130: error: ‘struct usbdevfs_bulktransfer’ has no member named ‘data’ make: *** [usb-linux.o] Fehler 1 Yes, that's a problem of the kernel headers on 10.2. You can work around this with the below snippet. Ciao, Michael. -- Index: usb-linux.c === RCS file: /sources/qemu/qemu/usb-linux.c,v retrieving revision 1.10 diff -u -p -r1.10 usb-linux.c --- usb-linux.c 10 Dec 2006 22:11:04 - 1.10 +++ usb-linux.c 29 Aug 2007 11:45:13 - @@ -26,6 +26,7 @@ #if defined(__linux__) #include dirent.h #include sys/ioctl.h +#define __user #include linux/usbdevice_fs.h #include linux/version.h
Re: [Qemu-devel] [patch] make qemu work with GCC 4
Hi, On Wed, 29 Aug 2007, Johannes Schindelin wrote: Thanks for your effor Michael! Now, I only hope, one of the patches that makes qemu gcc4 compliant are soon merged. Well, to throw a spanner in the works: this patch is the 4th patch along the lines that I came about. None of them (AFAICT) was tested well enough to be included in CVS. Indeed, the biggest problem seems to be to make a patch that not only works on the machine of the poster, but on other machines, too. Please? qemu with that patch builds on four architectures, and tests fine on two of them (including cross targets) and only because I haven't run it on the other two architectures yet. It might or might not need more patches than just mine (as I started from our package), but in that case they are independend of making qemu work with gcc 4, which is the only thing my patch is concerned about. Ciao, Michael.
Re: [Qemu-devel] [patch] make qemu work with GCC 4
Hi, On Wed, 29 Aug 2007, Paul Brook wrote: I solved that by placing one of the T[012] operands into memory for HOST_I386, thereby freeing one reg. Here's some justification of why that doesn't really cost performance: with three free regs GCC is already spilling like mad in the snippets, we just trade one of those memory accesses (to stack) with one other mem access to the cpu_state structure, which will be in cache. Do you have any evidence to support this claim? Not really, only an apple and orange comparison. A 1 iteration tests/sha1 run in the same Linux image, with -no-kqemu, on host and target i386: time ./sha1 with qemu-0.8.2 (compiled by gcc 3.3-hammer): 7.92 seconds with qemu-0.9.0-cvs (gcc4.1 compiled, with the patch): 8.15 seconds I'll try to get a better comparison. Note, though, that this is the only easy solution. Any other solution would either involve improving reload pretty much, or rewriting many of the op.c patterns (for all targets) to never require more than three registers, and ensure that gcc doesn't become clever and mashes some insns together again (and in trying to do that probably slow down the snippets again). Or doing away with dyngen at all. All three solutions are fairly involved, so I'm personally fine with the above slowdown (for i386 targets you won't normally get any noticable slowdown as you'd use kqemu). Last time I did this it caused a significant performance hit. I'd guess that most common ops are simple enough that we don't need more than 3 registers. For i386 target I'm redefining only T1 (as I figured that A0 and T0 are used a bit more), it might be that some of the code could be generated in a way to not make use of T1 too much, I haven't really poked at that. --- qemu-0.9.0.cvs.orig/softmmu_header.h - : %eax, %ecx, %edx, memory, cc); + : %eax, %edx, memory, cc); This change is wrong. The inline asm calls C code which clobbers %ecx. Indeed, must have been blind. Okay these are too many clobbers for poor gcc4 nevertheless, so a push/pop %ecx around the call needs to be done. Courious that this didn't lead to fire all over the place. See patch below. Ciao, Michael. Index: softmmu_header.h === RCS file: /sources/qemu/qemu/softmmu_header.h,v retrieving revision 1.15 diff -u -p -r1.15 softmmu_header.h --- softmmu_header.h23 May 2007 19:58:10 - 1.15 +++ softmmu_header.h29 Aug 2007 17:27:37 - @@ -230,9 +230,11 @@ static inline void glue(glue(st, SUFFIX) #else #error unsupported size #endif + pushl %%ecx\n pushl %6\n call %7\n popl %%eax\n + popl %%ecx\n jmp 2f\n 1:\n addl 8(%%edx), %%eax\n @@ -250,14 +252,18 @@ static inline void glue(glue(st, SUFFIX) : r (ptr), /* NOTE: 'q' would be needed as constraint, but we could not use it with T1 ! */ +#if DATA_SIZE == 1 || DATA_SIZE == 2 + q (v), +#else r (v), +#endif i ((CPU_TLB_SIZE - 1) CPU_TLB_ENTRY_BITS), i (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS), i (TARGET_PAGE_MASK | (DATA_SIZE - 1)), m (*(uint32_t *)offsetof(CPUState, tlb_table[CPU_MEM_INDEX][0].addr_write)), i (CPU_MEM_INDEX), m (*(uint8_t *)glue(glue(__st, SUFFIX), MMUSUFFIX)) - : %eax, %ecx, %edx, memory, cc); + : %eax, %edx, memory, cc); } #else
[Qemu-devel] [patch] make qemu work with GCC 4
Hi, [please keep me CCed, I'm not on this list] the below patch let's qemu be compiled by GCC 4.2 (probably also 4.1 and others) for most hosts (i386,x86_64,ia64,ppc). s390 as host is missing, and needs a compiler change to emit the literal store inline again, as the literal pool at the end fundamentally breaks the assumption that qemu can paste together the code snippets by patching out the return. I have no HOST_{ARM,MIPS*,ALPHA,SPARC*,M68K} machines to compile for that. It specifically changes these things: * ppc: adds -fno-section-anchors to OP_CFLAGS, as dyngen isn't prepared to deal with the relocs resulting from using section anchors * ppc: on target-alpha op_reset_FT GCC4 uses a floating point constant 0.0 to reset the ft regs, which in turn is loaded from the data section. The reloc for that is unhandled. Using -ffast-math would work around this, but I chose to be conservative and change only the op.c snippet in question. See the comment there. * i386: well, most of you will know that GCC4 doesn't compile qemu because of reload. The inherent problem is, that qemu uses 64bit entities in some places (sometimes structs), which GCC (4.x) manages to place in registers, i.e. needs 2 hardregs. But it sometimes just so happens that an instruction needing such DImode reg also has a memory operand with an indexed address (reg plus reg), hence two hardregs more. But qemu by default leaves just three free registers for compiling op.c -- boom. This is somewhat hard to work around in GCC (trust me :) ). I solved that by placing one of the T[012] operands into memory for HOST_I386, thereby freeing one reg. Here's some justification of why that doesn't really cost performance: with three free regs GCC is already spilling like mad in the snippets, we just trade one of those memory accesses (to stack) with one other mem access to the cpu_state structure, which will be in cache. Additionally I made sure that I put the least used Tx global into memory. I haven't done much performance measurements but noticed no glaring problems. Two more obvious changes in an inline asm in softmmu_header.h were necessary too. A qemu with that patch was tested on HOST_I386 with all images from the qemu homepage (i.e. arm, mips, mipsel, coldfire (that doesn't work) and sparc), some i386 boot isos and a freedos image. It also was tested on HOST_X86_64 installing openSUSE beta-something for i386 and x86_64. I haven't yet tested it on ia64 or ppc hosts. The whole patch is against a 0.9.0-cvs version from 2007-07-09 (Alex might know the exact checkout date), so chances are that it still applies :) Ciao, Michael. diff -urp qemu-0.9.0.cvs.orig/softmmu_header.h qemu-0.9.0.cvs/softmmu_header.h --- qemu-0.9.0.cvs.orig/softmmu_header.h2007-08-21 18:58:00.0 +0200 +++ qemu-0.9.0.cvs/softmmu_header.h 2007-08-21 20:40:23.0 +0200 @@ -254,14 +254,18 @@ static inline void glue(glue(st, SUFFIX) : r (ptr), /* NOTE: 'q' would be needed as constraint, but we could not use it with T1 ! */ +#if DATA_SIZE == 1 || DATA_SIZE == 2 + q (v), +#else r (v), +#endif i ((CPU_TLB_SIZE - 1) CPU_TLB_ENTRY_BITS), i (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS), i (TARGET_PAGE_MASK | (DATA_SIZE - 1)), m (*(uint32_t *)offsetof(CPUState, tlb_table[CPU_MEM_INDEX][0].addr_write)), i (CPU_MEM_INDEX), m (*(uint8_t *)glue(glue(__st, SUFFIX), MMUSUFFIX)) - : %eax, %ecx, %edx, memory, cc); + : %eax, %edx, memory, cc); } #else diff -urp qemu-0.9.0.cvs.orig/target-alpha/cpu.h qemu-0.9.0.cvs/target-alpha/cpu.h --- qemu-0.9.0.cvs.orig/target-alpha/cpu.h 2007-06-03 23:02:37.0 +0200 +++ qemu-0.9.0.cvs/target-alpha/cpu.h 2007-08-22 00:54:15.0 +0200 @@ -278,6 +278,8 @@ struct CPUAlphaState { * used to emulate 64 bits target on 32 bits hosts */ target_ulong t0, t1, t2; +#elif defined(HOST_I386) +target_ulong t2; #endif /* */ double ft0, ft1, ft2; diff -urp qemu-0.9.0.cvs.orig/target-alpha/exec.h qemu-0.9.0.cvs/target-alpha/exec.h --- qemu-0.9.0.cvs.orig/target-alpha/exec.h 2007-06-03 19:44:36.0 +0200 +++ qemu-0.9.0.cvs/target-alpha/exec.h 2007-08-21 21:28:58.0 +0200 @@ -40,7 +40,11 @@ register struct CPUAlphaState *env asm(A register uint64_t T0 asm(AREG1); register uint64_t T1 asm(AREG2); +#ifndef HOST_I386 register uint64_t T2 asm(AREG3); +#else +#define T2 (env-t2) +#endif #endif /* TARGET_LONG_BITS HOST_LONG_BITS */ diff -urp qemu-0.9.0.cvs.orig/target-arm/cpu.h qemu-0.9.0.cvs/target-arm/cpu.h --- qemu-0.9.0.cvs.orig/target-arm/cpu.h