Re: Disabling TLS address caching to help QEMU on GNU/Linux

2021-07-22 Thread Michael Matz
Hello,

On Thu, 22 Jul 2021, Richard Biener via Gcc wrote:

> But how does TLS usage transfer between threads?  On the gimple level 
> the TLS pointer is not visible and thus we'd happily CSE its address:

Yes.  All take-address operations then need to be encoded explicitely with 
a non-CSE-able internal function (or so):

   --> __ifn_get_tls_addr();

( in the argument just so that it's clear that it doesn't access the 
value at x and to get the current effects of address-taken marking of 
ADDR_EXPR).

(Or of course, ADDR_EXPR could be taken as unstable when applied to TLS 
decls).

Quite a big hammer IMHO.


Ciao,
Michael.



[Qemu-devel] [PATCH v2] ppc: Fix size of ppc64 xer register

2018-02-26 Thread Michael Matz
The normal gdb definition of the XER registers is only 32 bit,
and that's what the current version of power64-core.xml also
says (seems copied from gdb's).  But qemu's idea of the XER register
is target_ulong (in CPUPPCState, ppc_gdb_register_len and
ppc_cpu_gdb_read_register)

That mismatch leads to the following message when attaching
with gdb:

  Truncated register 32 in remote 'g' packet

(and following on that qemu stops responding).  The simple fix is
to say the truth in the .xml file.  But the better fix is to
actually make it 32bit on the wire, as old gdbs don't support
XML files for describing registers.  Also the XER state in qemu
doesn't seem to use the high 32 bits, so sending it off to gdb
doesn't seem worthwhile.

[v2: fix formatting in moved line and adjust others]

Signed-off-by: Michael Matz <m...@suse.de>
---
 target/ppc/gdbstub.c | 26 +-
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/target/ppc/gdbstub.c b/target/ppc/gdbstub.c
index 7a33813..688749d 100644
--- a/target/ppc/gdbstub.c
+++ b/target/ppc/gdbstub.c
@@ -33,14 +33,14 @@ static int ppc_gdb_register_len_apple(int n)
 return 8;
 case 64 ... 95:
 return 16;
-case 64+32: /* nip */
-case 65+32: /* msr */
-case 67+32: /* lr */
-case 68+32: /* ctr */
-case 69+32: /* xer */
-case 70+32: /* fpscr */
+case 64 + 32: /* nip */
+case 65 + 32: /* msr */
+case 67 + 32: /* lr */
+case 68 + 32: /* ctr */
+case 70 + 32: /* fpscr */
 return 8;
-case 66+32: /* cr */
+case 66 + 32: /* cr */
+case 69 + 32: /* xer */
 return 4;
 default:
 return 0;
@@ -61,6 +61,8 @@ static int ppc_gdb_register_len(int n)
 return 8;
 case 66:
 /* cr */
+case 69:
+/* xer */
 return 4;
 case 64:
 /* nip */
@@ -70,8 +72,6 @@ static int ppc_gdb_register_len(int n)
 /* lr */
 case 68:
 /* ctr */
-case 69:
-/* xer */
 return sizeof(target_ulong);
 case 70:
 /* fpscr */
@@ -152,7 +152,7 @@ int ppc_cpu_gdb_read_register(CPUState *cs, uint8_t 
*mem_buf, int n)
 gdb_get_regl(mem_buf, env->ctr);
 break;
 case 69:
-gdb_get_regl(mem_buf, env->xer);
+gdb_get_reg32(mem_buf, env->xer);
 break;
 case 70:
 gdb_get_reg32(mem_buf, env->fpscr);
@@ -208,7 +208,7 @@ int ppc_cpu_gdb_read_register_apple(CPUState *cs, uint8_t 
*mem_buf, int n)
 gdb_get_reg64(mem_buf, env->ctr);
 break;
 case 69 + 32:
-gdb_get_reg64(mem_buf, env->xer);
+gdb_get_reg32(mem_buf, env->xer);
 break;
 case 70 + 32:
 gdb_get_reg64(mem_buf, env->fpscr);
@@ -259,7 +259,7 @@ int ppc_cpu_gdb_write_register(CPUState *cs, uint8_t 
*mem_buf, int n)
 env->ctr = ldtul_p(mem_buf);
 break;
 case 69:
-env->xer = ldtul_p(mem_buf);
+env->xer = ldl_p(mem_buf);
 break;
 case 70:
 /* fpscr */
@@ -309,7 +309,7 @@ int ppc_cpu_gdb_write_register_apple(CPUState *cs, uint8_t 
*mem_buf, int n)
 env->ctr = ldq_p(mem_buf);
 break;
 case 69 + 32:
-env->xer = ldq_p(mem_buf);
+env->xer = ldl_p(mem_buf);
 break;
 case 70 + 32:
 /* fpscr */
-- 
1.8.1.4




[Qemu-devel] [PATCH] ppc: Fix size of ppc64 xer register (fwd)

2018-02-23 Thread Michael Matz
The normal gdb definition of the XER registers is only 32 bit,
and that's what the current version of power64-core.xml also
says (seems copied from gdb's).  But qemu's idea of the XER register
is target_ulong (in CPUPPCState, ppc_gdb_register_len and
ppc_cpu_gdb_read_register)

That mismatch leads to the following message when attaching
with gdb:

  Truncated register 32 in remote 'g' packet

(and following on that qemu stops responding).  The simple fix is
to say the truth in the .xml file.  But the better fix is to
actually make it 32bit on the wire, as old gdbs don't support
XML files for describing registers.  Also the XER state in qemu
doesn't seem to use the high 32 bits, so sending it off to gdb
doesn't seem worthwhile.

Signed-off-by: Michael Matz <m...@suse.de>
---
 target/ppc/gdbstub.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/target/ppc/gdbstub.c b/target/ppc/gdbstub.c
index 7a33813..b6f6693 100644
--- a/target/ppc/gdbstub.c
+++ b/target/ppc/gdbstub.c
@@ -37,10 +37,10 @@ static int ppc_gdb_register_len_apple(int n)
 case 65+32: /* msr */
 case 67+32: /* lr */
 case 68+32: /* ctr */
-case 69+32: /* xer */
 case 70+32: /* fpscr */
 return 8;
 case 66+32: /* cr */
+case 69+32: /* xer */
 return 4;
 default:
 return 0;
@@ -61,6 +61,8 @@ static int ppc_gdb_register_len(int n)
 return 8;
 case 66:
 /* cr */
+case 69:
+/* xer */
 return 4;
 case 64:
 /* nip */
@@ -70,8 +72,6 @@ static int ppc_gdb_register_len(int n)
 /* lr */
 case 68:
 /* ctr */
-case 69:
-/* xer */
 return sizeof(target_ulong);
 case 70:
 /* fpscr */
@@ -152,7 +152,7 @@ int ppc_cpu_gdb_read_register(CPUState *cs, uint8_t 
*mem_buf, int n)
 gdb_get_regl(mem_buf, env->ctr);
 break;
 case 69:
-gdb_get_regl(mem_buf, env->xer);
+gdb_get_reg32(mem_buf, env->xer);
 break;
 case 70:
 gdb_get_reg32(mem_buf, env->fpscr);
@@ -208,7 +208,7 @@ int ppc_cpu_gdb_read_register_apple(CPUState *cs, uint8_t 
*mem_buf, int n)
 gdb_get_reg64(mem_buf, env->ctr);
 break;
 case 69 + 32:
-gdb_get_reg64(mem_buf, env->xer);
+gdb_get_reg32(mem_buf, env->xer);
 break;
 case 70 + 32:
 gdb_get_reg64(mem_buf, env->fpscr);
@@ -259,7 +259,7 @@ int ppc_cpu_gdb_write_register(CPUState *cs, uint8_t 
*mem_buf, int n)
 env->ctr = ldtul_p(mem_buf);
 break;
 case 69:
-env->xer = ldtul_p(mem_buf);
+env->xer = ldl_p(mem_buf);
 break;
 case 70:
 /* fpscr */
@@ -309,7 +309,7 @@ int ppc_cpu_gdb_write_register_apple(CPUState *cs, uint8_t 
*mem_buf, int n)
 env->ctr = ldq_p(mem_buf);
 break;
 case 69 + 32:
-env->xer = ldq_p(mem_buf);
+env->xer = ldl_p(mem_buf);
 break;
 case 70 + 32:
 /* fpscr */
-- 
1.8.1.4




Re: [Qemu-devel] [PATCH 16/16] exec-all.h: Increase MAX_OP_PER_INSTR for ARM A64 decoder

2014-03-10 Thread Michael Matz
Hi,

On Sun, 9 Mar 2014, Peter Maydell wrote:

 diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
 index 6a3597d..2435c95 100644
 --- a/target-arm/translate-a64.c
 +++ b/target-arm/translate-a64.c
 @@ -9007,9 +9007,8 @@ static void disas_simd_two_reg_misc(DisasContext *s, 
 uint32_t insn)
  case 0x19: /* FRINTM */
  case 0x38: /* FRINTP */
  case 0x39: /* FRINTZ */
 -case 0x58: /* FRINTA */
  need_rmode = true;
 -rmode = extract32(opcode, 5, 2) | (extract32(opcode, 0, 1)  1);
 +rmode = extract32(opcode, 5, 1) | (extract32(opcode, 0, 1)  1);
  /* fall through */
  case 0x59: /* FRINTX */
  case 0x79: /* FRINTI */
 @@ -9019,6 +9018,15 @@ static void disas_simd_two_reg_misc(DisasContext *s, 
 uint32_t insn)
  return;
  }
  break;
 +case 0x58: /* FRINTA */
 +need_rmode = true;
 +rmode = FPROUNDING_TIEAWAY;
 +need_fpstatus = true;
 +if (size == 3  !is_q) {
 +unallocated_encoding(s);
 +return;
 +}
 +break;

Merge the above into [15/16] ?


Ciao,
Michael.



Re: [Qemu-devel] Call for testing QEMU aarch64-linux-user emulation

2014-03-10 Thread Michael Matz
Hi,

On Mon, 10 Mar 2014, Alex Bennée wrote:

  Not sure there's much point looking very deeply into this. Java 
  programs are threaded, threaded programs don't work under QEMU = 
  don't try to run Java under QEMU :-)
 
 Having said that I'm sure there was another SIGILL related crash on 
 Launchpad and I think we would be interested in those. Is JAVA really 
 that buggy under QEMU just because of threading?

Generally speaking, yes.  I've never seen problems with openjdk (with the 
suse tree), so the segfault above might be also be related to the segfault 
handling for read-only data segments containing code (the signal 
trampoline on stack), for which the patches were recently proposed 
upstream.


Ciao,
Michael.

Re: [Qemu-devel] Call for testing QEMU aarch64-linux-user emulation

2014-02-27 Thread Michael Matz
Hi,

On Wed, 26 Feb 2014, Dann Frazier wrote:

 I've narrowed down the changes that seem to prevent both types of
 segfaults to the following changes that introduce a wrapper around
 sigprocmask:
 
 https://github.com/susematz/qemu/commit/f1542ae9fe10d5a241fc2624ecaef5f0948e3472
 https://github.com/susematz/qemu/commit/4e5e1607758841c760cda4652b0ee7a6bc6eb79d
 https://github.com/susematz/qemu/commit/63eb8d3ea58f58d5857153b0c632def1bbd05781
 
 I'm not sure if this is a real fix or just papering over my issue -

It's fixing the issue, but strictly speaking introduces an QoI problem. 
SIGSEGV must not be controllable by the guest, it needs to be always 
deliverable to qemu; that is what's fixed.

The QoI problem introduced is that with the implementation as is, the 
fiddling with SIGSEGV is detectable by the guest.  E.g. if it installs a 
segv handler, blocks segv, then forces a segfault, checks that it didn't 
arrive, then unblocks segv and checks that it now arrives, such testcase 
would be able to detect that in fact it couldn't block SIGSEGV.

Luckily for us, the effect of blocking SIGSEGV and then generating one in 
other ways than kill/sigqueue/raise (e.g. by writing to NULL) are 
undefined, so in practice that QoI issue doesn't matter.

To fix also that latter part it'd need a further per-thread flag 
segv_blocked_p which needs to be checked before actually delivering a 
guest-directed SIGSEGV (in comparison to a qemu-directed SEGV), and 
otherwise requeue it.  That's made a bit complicated when the SEGV was 
process-directed (not thread-directed) because in that case it needs to be 
delivered as long as there's _any_ thread which has it unblocked.  So 
given the above undefinedness for sane uses of SEGVs it didn't seem worth 
the complication of having an undetectable virtualization of SIGSEGV.

 but either way, are these changes reasonable for upstream submission?

IIRC the first two commits (from Alex Barcelo) were submitted once in the 
past, but fell through the cracks.


Ciao,
Michael.



Re: [Qemu-devel] Call for testing QEMU aarch64-linux-user emulation

2014-02-25 Thread Michael Matz
Hi,

On Tue, 25 Feb 2014, Andreas Färber wrote:

  There are some pretty large differences between these trees with 
  respect to signal syscalls - is that the likely culprit?
  
  Quite likely. We explicitly concentrated on the arch64 specific 
  instruction emulation leaving more generic patches to flow in from 
  SUSE as they matured.
  
  I guess it's time to go through the remaining patches and see what's 
  up-streamable.
  
  Alex/Michael,
  
  Are any of these patches in flight now?
 
 I don't think so, Alex seems to hate cleaning that stuff up... :P
 
 Compare https://github.com/openSUSE/qemu/commits/opensuse-1.7 for our
 general queue. We have patches adding locking to TCG, and there's a hack
 pinning the CPU somewhere.

The locking and pinning is all wrong (resp. overbroad).  The aarch64-1.6 
branch contains better implementations for that and some actual fixes for 
aarch64' userspace.

Somehow I don't find the time to go through our patches in linux-user and 
submit them.  The biggest road-block is that signal vs syscall handling is 
fundamentally broken in linux-user and it's unfixable without 
assembler implementations of the syscall caller.  That is also still 
broken on the suse branch where I tried various ways to fix that until 
coming to that conclusion.


Ciao,
Michael.

Re: [Qemu-devel] Call for testing QEMU aarch64-linux-user emulation

2014-02-25 Thread Michael Matz
Hi,

On Tue, 25 Feb 2014, Peter Maydell wrote:

 On 25 February 2014 13:33, Michael Matz m...@suse.de wrote
  The biggest road-block is that signal vs syscall handling is
  fundamentally broken in linux-user and it's unfixable without
  assembler implementations of the syscall caller.
 
 I'm not entirely sure it's possible to fix even with
 hand-rolled assembly, to be honest.

I am fairly sure.  The problem is simply to detect if the signal arrived 
while inside the kernel (doing the syscalls job) or still or already 
outside. This structure helps with that:

before:
setup args and stuff for syscall to do
atsys:
syscall insn (single insn!)
after:
mov return, return-register-per-psABI
realafter:
rest of stuff

When a signal arrives you look at the return address the kernel puts into 
the siginfo.  Several cases:

* before = retaddr  atsys:
  syscall hasn't yet started, so break syscall sequence, handle signal in 
  main loop, redo the syscall.
* atsys == retaddr
  syscall has started and the kernel wants to restart it after sighandler
  returns, _or_ syscall was just about to be started.  No matter what,
  the right thing to do is to actually do the syscall (again) after 
  handling the signal.  So break syscall sequence, handle signal in main
  loop, (re)do the syscall.
* after = retaddr  realafter:
  syscall is complete but return value not yet in some variable but still 
  in register (or other post-syscall work that still needs doing isn't
  complete yet); nothing interesting to do, just let it continue with the 
  syscall sequence, handle signal in main loop after that one returned.
* retaddr any other value:
  uninteresting; actually I'm not sure we'd need the distinction between 
  after and realafter.  Handle signal as usual in main loop.

The important thing for qemu is to know precisely if the signal arrived 
before the syscall was started (or is to be restarted), or after it 
returned, and for that the compiler must not be allowed to insert any code 
between atsys and after.

 However there are a bunch of bugfixes in your tree
 which it would be really nice to see upstreamed:
 the sendmmsg patch, for instance. We can at least
 get the aarch64 support to the same level as the
 32 bit arm linux-user setup, which is genuinely
 useful to people despite the well known races and
 locking issues.

Yeah.


Ciao,
Michael.



Re: [Qemu-devel] [PATCH v3 0/5] disas: add libvixl to support A64 disassembly

2014-02-06 Thread Michael Matz
Hi,

On Wed, 5 Feb 2014, Peter Maydell wrote:

 Changes v2 - v3:
  * added a README clarifying that libvixl's disassembly support
is not complete and that contributions should go to libvixl
upstream first for preference

Hmm, why aren't we simply using the binutils disassembler?  It's also 
(C) by ARM, so there shouldn't be any relicensing problems.  And it 
does support AdvSIMD and system instructions.


Ciao,
Michael.



Re: [Qemu-devel] [PATCH v3 0/5] disas: add libvixl to support A64 disassembly

2014-02-06 Thread Michael Matz
Hi,

On Thu, 6 Feb 2014, Peter Maydell wrote:

 On 6 February 2014 13:45, Michael Matz m...@suse.de wrote:
  Hmm, why aren't we simply using the binutils disassembler?  It's also
  (C) by ARM, so there shouldn't be any relicensing problems.  And it
  does support AdvSIMD and system instructions.
 
 It is GPLv3 which is not compatible with GPLv2 which
 QEMU requires. The contribution process for binutils
 involves a copyright assignment which means the FSF
 now have the copyright there, as I understand it.

The FSF always grants back rights on the contribution to the contributor.  
ARM could simply double-license their original contribution of the 
disassembler.


Ciao,
Michael.



Re: [Qemu-devel] [PATCH v3 0/5] disas: add libvixl to support A64 disassembly

2014-02-06 Thread Michael Matz
Hi,

On Thu, 6 Feb 2014, Christopher Covington wrote:

  The FSF always grants back rights on the contribution to the 
  contributor.  ARM could simply double-license their original 
  contribution of the disassembler.
 
 Is dual licensing always possible given a grant-back? What if the 
 contribution is a derivative of a GPL-3.0 licensed work?

Then it's not.  I don't think that's the case with ARMs aarch64 disasm, 
but ultimately only ARM can say, and Peter already hinted that this all 
isn't possible.  Which is a pity, duplicate work, inferior capability (at 
least for now), and such ...


Ciao,
Michael.



Re: [Qemu-devel] [PATCH 26/60] AArch64: Add ADR instruction emulation

2013-11-20 Thread Michael Matz
Hi,

On Tue, 19 Nov 2013, Claudio Fontana wrote:

  +uint64_t imm;
  +uint64_t base;
  +
  +imm = get_sbits(insn, 5, 19)  2;
  +imm |= get_bits(insn, 29, 2);
 
 does this work with negative values?

Yes.  get_sbits returns a sign extended (32bit) int, the shift doesn't 
change that, the conversion to uint64_t first sign extends to 64bit and 
then converts to unsigned (conceptually).  From then on it's an unsigned 
value (with high bits set when input was negative), but two complement 
arithmetic makes that irrelevant.


Ciao,
Michael.



Re: [Qemu-devel] [PATCH 50/60] AArch64: Add Floating-point-fixed-point

2013-11-20 Thread Michael Matz
Hi,

On Tue, 19 Nov 2013, Janne Grunau wrote:

  +static void handle_fpfpconv(DisasContext *s, uint32_t insn)
  +{
  +int opcode = get_bits(insn, 16, 3);
  +int rmode = get_bits(insn, 20, 2);
 
 rmode is at 19
 
  +case 0x1: /* [S|U]CVTF (scalar-float) */
 
 and it's case 0x0: for [S|U]CVTF

Both were fixed after Alex' series with 0dd22d0c:
https://github.com/susematz/qemu/commit/0dd22d0c5cd1dcdccd5df953f1981d461d3054e5


Ciao,
Michael.



Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation

2013-11-18 Thread Michael Matz
Hi,

On Mon, 18 Nov 2013, Claudio Fontana wrote:

  +case 3:
  +tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift);
  +break;
  
  Incorrect rotate for 32bit?

32bit rotates and shifts were fixed in a patch later than the 60er series 
Alex posted.  See attached.  (Generally there are many fixes to emulated 
instructions in that branch)

  +if (!shift_amount  source == 0x1f) {
 
 Besides the comment, is this correct?

No, it needs to check for opc == 1.

  +tcg_dest = cpu_reg(dest);
  +switch (opc) {
  +case 0x0:
  +case 0x3:
  +tcg_gen_and_i64(tcg_dest, cpu_reg(source), tcg_op2);
  +break;
  +case 0x1:
  +tcg_gen_or_i64(tcg_dest, cpu_reg(source), tcg_op2);
  +break;
  +case 0x2:
  +tcg_gen_xor_i64(tcg_dest, cpu_reg(source), tcg_op2);
  +break;
  +}
  +
  +if (is_32bit) {
  +tcg_gen_ext32u_i64(tcg_dest, tcg_dest);
  +}
  +
  +if (setflags) {
  +gen_helper_pstate_add(pstate, pstate, tcg_dest, cpu_reg(31), 
  tcg_dest);
  +}
  
  Incorrect flags generated.  They're different between add/sub and logical.
  In particular, C and V are always zero.

That's done correctly with the fixed pstate helpers coming with a later 
patch (see attached as well).  reg31 is zero, so that's flags as if for 
dest == dest + 0, and PSTATE_C and PSTATE_V will be zero.  That is, the 
logical flags are the same as the arithmetic flags for result plus zero 
with no carry_in.


Ciao,
Michael.From df54486da31d6329696effa61096eda5ab85395a Mon Sep 17 00:00:00 2001
From: Michael Matz m...@suse.de
Date: Sun, 24 Mar 2013 02:52:42 +0100
Subject: [PATCH] Fix 32bit rotates.

The 32bit shifts generally weren't careful with the upper parts,
either bits could leak in (for right shift) or leak or (for left shift).
And rotate was completely off, rotating around bit 63, not 31.
This fixes the CAST5 hash algorithm.
---
 target-arm/translate-a64.c | 30 +++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 96dc281..e3941a1 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -596,25 +596,49 @@ static TCGv_i64 get_shift(int reg, int shift_type, TCGv_i64 tcg_shift,
 r = tcg_temp_new_i64();
 
 /* XXX carry_out */
+/* Careful with the width.  We work on 64bit, but must make sure
+   that we zero-extend the result on out, and ignore any upper bits,
+   that might still be set in that register.  */
 switch (shift_type) {
 case 0: /* LSL */
+	/* left shift is easy, simply zero-extend on out */
 tcg_gen_shl_i64(r, cpu_reg(reg), tcg_shift);
+	if (is_32bit)
+	  tcg_gen_ext32u_i64 (r, r);
 break;
 case 1: /* LSR */
-tcg_gen_shr_i64(r, cpu_reg(reg), tcg_shift);
+	/* For logical right shift we zero extend first, to zero
+	   the upper bits.  We don't need to extend on out.  */
+	if (is_32bit) {
+	tcg_gen_ext32u_i64 (r, cpu_reg(reg));
+	tcg_gen_shr_i64 (r, r, tcg_shift);
+	} else
+	  tcg_gen_shr_i64(r, cpu_reg(reg), tcg_shift);
 break;
 case 2: /* ASR */
+	/* For arithmetic right shift we sign extend first, then shift,
+	   and then need to clear the upper bits again.  */
 if (is_32bit) {
 TCGv_i64 tcg_tmp = tcg_temp_new_i64();
 tcg_gen_ext32s_i64(tcg_tmp, cpu_reg(reg));
 tcg_gen_sar_i64(r, tcg_tmp, tcg_shift);
+	tcg_gen_ext32u_i64 (r, r);
 tcg_temp_free_i64(tcg_tmp);
 } else {
 tcg_gen_sar_i64(r, cpu_reg(reg), tcg_shift);
 }
 break;
-case 3:
-tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift);
+case 3: /* ROR */
+	/* For rotation extending doesn't help, we really have to use
+	   a 32bit rotate.  */
+	if (is_32bit) {
+	TCGv_i32 tmp = tcg_temp_new_i32();
+tcg_gen_trunc_i64_i32(tmp, cpu_reg(reg));
+	tcg_gen_rotr_i32(tmp, tmp, tcg_shift);
+tcg_gen_extu_i32_i64(r, tmp);
+tcg_temp_free_i32(tmp);
+	} else
+	  tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift);
 break;
 }
 
-- 
1.8.1.4

From 33137f8a660750d7d8598c7e467f4ccc8dc5ef85 Mon Sep 17 00:00:00 2001
From: Michael Matz m...@suse.de
Date: Sat, 23 Mar 2013 04:53:44 +0100
Subject: [PATCH] Fix the pstate flags helpers

ADCS and SBCS/SUBS sometimes gave the wrong results
for the C and V flags.  This fixes it.
---
 target-arm/helper-a64.c | 52 -
 1 file changed, 12 insertions(+), 40 deletions(-)

diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c
index 4375bf0..4fcb09b 100644
--- a/target-arm/helper-a64.c
+++ b/target-arm/helper-a64.c
@@ -7,8 +7,6 @@
 
 uint32_t HELPER(pstate_add)(uint32_t pstate, uint64_t a1, uint64_t a2, uint64_t ar)
 {
-int64_t s1 = a1;
-int64_t s2 = a2;
 int64_t sr = ar;
 
 pstate = ~(PSTATE_N | PSTATE_Z | PSTATE_C | PSTATE_V);
@@ -21,11 +19,15

Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation

2013-11-18 Thread Michael Matz
Hi,

On Mon, 18 Nov 2013, Peter Maydell wrote:

   +case 3:
   +tcg_gen_rotr_i64(r, cpu_reg(reg), tcg_shift);
   +break;
  
   Incorrect rotate for 32bit?
 
  32bit rotates and shifts were fixed in a patch later than the 60er series
  Alex posted.  See attached.  (Generally there are many fixes to emulated
  instructions in that branch)
 
 I think we're going to need to look through and fold in those fixes, 
 otherwise we'll end up reduplicating that work in the course of code 
 review :-(

Most probably.  Authorship will be lost :-/ I was planning to submit all 
of these once the 60er set of Alex would be applied.  If you're reworking 
that set more of less completely then it indeed makes more sense to fold 
in those things and so it'd probably be better to have them applied (i.e. 
basically have our full branch applied when dissecting things).

The commits that fix things in the a64 decoder proper (not the linux-user 
implementation) would be:

e14c1a5 softfloat: correctly handle overflow in float[32|64] to uint64 
conversion
cbc98b1 aarch64: Fix FCVTZU for single float
a91f762 aarch64: Fix UZP/ZIP/TRN
644c748 aarch64: Fix 32bit TST
2a717e8 Fix FCVTAS and FCVTAU
0dd22d0 Fix decoding of floating-fixed conversions
d52c999 Fix implementation of USHLL/SSHLL
cfbb9e1 Fix using uninitialized value
ecfdfcd Fix typo in FSUB detection
87fd8ca Increase MAX_OP_PER_INSTR
38452d8 Fix USHLL, and implement other SIMD shifts
4146d40 Fix INS element
a62437c Fix fcmp(e) with NaNs
ec2b8f3 softfloat: Fix float64_to_uint64
b003867 Fix EXTR for 32bit
df54486 Fix 32bit rotates.
33137f8 Fix the pstate flags helpers
75cb838 Don't set flush to zero by default
564e811 Fix 128bit ldr (literal)
0ff91a0 Fix long immediate constants

(That's all on top Alex' posted patchset but not git rebased onto it, top 
of Alex roughly corresponds to commit 40d66b61)


Ciao,
Michael.



Re: [Qemu-devel] [PATCH 14/60] AArch64: Add orr instruction emulation

2013-11-18 Thread Michael Matz
Hi,

On Mon, 18 Nov 2013, Claudio Fontana wrote:

  +tcg_gen_trunc_i64_i32(tmp, cpu_reg(reg));
  +   tcg_gen_rotr_i32(tmp, tmp, tcg_shift);
 
 Isn't this problematic?
 We are using gen_rotr_i32, but passing tcg_shift, which is a TCGv_i64.

With CONFIG_DEBUG_TCG it'll break, yes.  Though in principle there's no 
canonical relation between the two argument types for shifts and rotates 
(unlike addition for example) TCG indeed wants to ensure that 
typeof(arg2)==typeof(arg1).

 I remember I had compilation failures in the past when I tried something 
 similar, so my understanding is that this can work with a certain 
 compiler under certain compiler options, but is not guaranteed to work 
 in all cases.
 
 I think we need to either explicitly convert the tcg_shift to a 
 TCGv_i32, or we need to use an open coded version of the rotr_i64 that 
 inserts at (32 - n) instead of (64 - n)
 
 What do you think?

I think converting tcg_shift might eventually lead to better generated 
code (if tcg is optmizing enough, now or in the future, haven't checked).


Ciao,
Michael.



Re: [Qemu-devel] Checking the state of arm64-linux-user

2013-10-30 Thread Michael Matz
Hi,

On Tue, 29 Oct 2013, Peter Maydell wrote:

 On 29 October 2013 18:55, Alex Bennée alex.ben...@linaro.org wrote:
  I think the problem is arm64 has been posted in several dependant patch
  sets hence working from a git tree. I think for now I'll take off the
  -Werror training wheels and see how far it gets.
 
 http://lists.nongnu.org/archive/html/qemu-devel/2013-09/msg04503.html
 
 is Alex's patchset which he rebased on to a qemu master,
 so if you can't wait for v2 then that's the one to look at.

Alex's patchset covers only the first part of the whole thing, and so 
misses some instructions and fixes.  If you merely want to use 
qemu-arm64 you're better off using the tree from me.

Alex B.: I'm building that tree with gcc 4.7 and this config:
% ../configure --disable-system --enable-linux-user --disable-werror \
  --static --disable-linux-aio --disable-strip \
  --target-list=arm64-linux-user
% make


Ciao,
Michael.

[Qemu-devel] Re: CPU type qemu64 breaks guest code

2011-03-22 Thread Michael Matz
Hi,

On Mon, 21 Mar 2011, Andre Przywara wrote:

   .name = qemu64,
   .level = 4,
   .vendor1 = CPUID_VENDOR_AMD_1,
   .vendor2 = CPUID_VENDOR_AMD_2,
   .vendor3 = CPUID_VENDOR_AMD_3,
   .family = 6,
   .model = 2,
   .stepping = 3,
  ...
 
 Well, yes, this is strange, and a different CPU model mimicking some really
 existing CPU would be better. But in my experience this opens up a can of
 worms, since a different family will trigger a lot of other nasty code that
 was silent before.

Mimicking as a really existing CPU would trigger exactly those code 
paths that are triggered when that same code is running on real hardware.
If such hypothetical problems were real they would occur for non-emulated 
hardware already.  But they don't.

 else if (strcmp (vendor_string, AuthenticAMD) == 0)
 switch (family)
   {
  case 5:
  case 6:
  abort ();
 
  The reasoning behind it is that for AMD, all 64-bit CPUs were family
  15.
 
 I guess that should be fixed there, as it is obviously nonsense.

Not really.  The above is for the x86_64 code paths, i.e. 64bit code.  
There never were, and there never will be, AMD processors capable of 
running 64bit code that have family 5 or 6.  The abort can't ever trigger 
on hardware or correctly emulated hardware.

 I haven't check what they actually need that family 6 does not provide, 
 if it is long mode, then this bit should be checked.

It's not about need.  It's about tuning and expectations.  gmp wants to 
tune itself according to the hardware it runs on, hence it switches on the 
vendor and family/model.  And in order not to have to handle combinations 
that can't exist in the real world (which seems sane to me) the aborts 
where put in place.  Now there's something to be said about being lenient 
in what you accept, but it's not wrong to be strict.

 I found that there is no valid combination for both Intel and AMD.

Of course not.  Why should there?  The FMS combination obviously can't 
exist independend of the (claimed) vendor.  Trying to go for one FMS that 
fits all is going to fail, how could it be different?


Ciao,
Michael.



Re: [Qemu-devel] [PATCH 02/40] elf: Add notes implementation

2010-11-02 Thread Michael Matz
Hi,

On Mon, 1 Nov 2010, Alexander Graf wrote:

  No. Pointers of same type which are not void pointers can be compared.
  
  There is even a data type ptrdiff_t, so you can also compare their
  difference with zero.
 
 Let's ask someone who definitely knows :).
 
 Michael, is code like
 
 char *x = a, *y = b;
 if (x  y) {
   ...
 }

Pointers can be compared iff they point into the same object 
(including right after the object), so it depends on what a and b were 
above.  This would be invalid for instance:

  int o1, o2;
  int *p1 = o1, *p2 = o2;
  if (p1  p2) ...

 valid? Or do I first have to cast x and y to unsigned longs or 
 uintptr_t?

For doing a valid pointer comparison you don't have to cast anything.  
Casting doesn't make an invalid comparison valid.


Ciao,
Michael.



Re: [Qemu-devel] Re: [PATCH 1/5] Fix i386 Host

2008-01-18 Thread Michael Matz
Hi,

On Fri, 18 Jan 2008, Johannes Schindelin wrote:

  Well, I can tell you why, but it doesn't help you: the 3.4.2 compiler 
  has different deficiencies in reload than the 4.x line of compilers.  
  To make the whole thingy work on all compilers trying and testing is 
  required to avoid all these different deficiencies.  My patch is 
  partly real bug fixes (the q constraint part for instance) and 
  partly changes helping to lessen the register pressure which reload 
  needs to fix (the %ecx thingy for instance).  Both of these might 
  expose the bugs in 3.4.2, in which case we need to hack around those 
  as well.  That's what I tried to do with the patch from 
  http://article.gmane.org/gmane.comp.emulators.qemu/22762 , but it 
  needs of course testing by someone who actually uses 3.4.2.
 
 I just downloaded it, and tested it on MinGW with 3.4.2, and I still get 
 the can't find register in class `Q_REGS´ while reloading `asm´ error.

Bummer.  As a test of theory (I'm not proposing this as patch), can you 
see what happens if you make vtmp volatile, i.e. make it look like so ? :

#if DATA_SIZE == 1 || DATA_SIZE == 2
volatile RES_TYPE vtmp = v;
#endif

Bah, remote work-arounding of reload sucks :-)


Ciao,
Michael.

Re: [Qemu-devel] Re: [PATCH 1/5] Fix i386 Host

2008-01-18 Thread Michael Matz
Hi,

On Fri, 18 Jan 2008, Johannes Schindelin wrote:

asm ( ... movzbl %b1, %%edx\n ...  : : r (blubb), r (bla) );
   
   Okay, but this only concerns gcc4, apparently.
  
  No, it's nothing to do with GCC.
 
 But apparently it has!  With gcc  4 I did never get the error.

As I tried to explain, this is pure luck.

 Which probably means that gcc  4 did _not_ use ecx, and therefore it 
 does not have to be pushed and popped.

We are talking about the hunk using the q constraint for operand 1 in 
st[bw]_kernel.  The change in the clobber list (and the associated 
saving/restoring of %ecx around the call) is something entirely different.

 Which -- judging from how commonly glue() is called in op.c -- could 
 mean a performance hit.

glue() is a macro, the function called is stw_kernel (inline function).

 I am all for supporting gcc  3, but please, please not at the cost of 
 having a performance hit for _existing_ users.

Have you measured this?  This function actually does a call to stw_mmu, a 
rather slow and big function, the overhead of one register store more or 
less is probably zero.

But that point is mood anyway.  When it works without the q constraint 
in gcc 3.4.2 it only does so, because GCC allocates one of the ax-dx 
registers to that operand (by luck, not by design).  As T1 is coming in in 
esi there anyway existed a reg-reg move already, so you pay that 
performance hit (if you like to call it such) already.

  Only if you want to trust your luck.  I fear I don't have gcc 3.4.2 
  lying around anywhere, so I can't really help debugging this reload 
  breakage in that GCC version.  It might help to introduce a temporary to 
  guide GCC through this problematic reload case by detaching the global 
  register variable from the asm operand.  For cases where it's no problem 
  this should be optimized away, so doesn't inhibit a performance cost.  
  What I mean is something like the below.  If someone with gcc 3.4.2 
  could test that ...
 
 I do ask myself how gcc would optimise away instructions that are 
 explicitely written in the asm() statement.  If it does so, I consider 
 this a serious bug in gcc.

My patch in the last mail introduces a copy in C (to vtmp), _that_ can be 
optimized away under the right circumstances.  Of course GCC does not 
change the asm template in any way.


Ciao,
Michael.




Re: [Qemu-devel] Re: [PATCH 1/5] Fix i386 Host

2008-01-18 Thread Michael Matz
Hi,

On Fri, 18 Jan 2008, Johannes Schindelin wrote:

   But apparently it has!  With gcc  4 I did never get the error.
  
  As I tried to explain, this is pure luck.
 
 Maybe.
 
   Which probably means that gcc  4 did _not_ use ecx, and therefore it 
   does not have to be pushed and popped.
  
  We are talking about the hunk using the q constraint for operand 1 in 
  st[bw]_kernel.  The change in the clobber list (and the associated 
  saving/restoring of %ecx around the call) is something entirely different.
 
 It cannot be, because just changing the clobber list makes the code 
 compile again!

But I'm not talking about the clobber list at all.  I reacted to the first 
mail forwarded to me, which was a question specifically about the hunk 
adding the q constraint, whose purpose I explained.  Are you now also 
asking about the changes regarding %ecx and the clobber list?


Ciao,
Michael.




Re: [Qemu-devel] Re: [PATCH 1/5] Fix i386 Host

2008-01-18 Thread Michael Matz
Hi,

On Fri, 18 Jan 2008, Johannes Schindelin wrote:

  But I'm not talking about the clobber list at all.  I reacted to the 
  first mail forwarded to me, which was a question specifically about the 
  hunk adding the q constraint, whose purpose I explained.  Are you now 
  also asking about the changes regarding %ecx and the clobber list?
 
 Okay, look, I do not have enough time to fix it myself.  But there _is_ a 
 breakage with gcc 3.4.2 on MinGW.  If that is not fixed, I will openly 
 oppose that patch going into CVS.

That's fine and is why I sent a test patch which might fix the 
breakage when applied on top of the constraint patch.

 All I tried was to get an understanding why the current patch Fix i386 
 (which is still misnamed) broke on MinGW.

Well, I can tell you why, but it doesn't help you: the 3.4.2 compiler has 
different deficiencies in reload than the 4.x line of compilers.  To make 
the whole thingy work on all compilers trying and testing is required to 
avoid all these different deficiencies.  My patch is partly real bug fixes 
(the q constraint part for instance) and partly changes helping to 
lessen the register pressure which reload needs to fix (the %ecx thingy 
for instance).  Both of these might expose the bugs in 3.4.2, in which 
case we need to hack around those as well.  That's what I tried to do with 
the patch from http://article.gmane.org/gmane.comp.emulators.qemu/22762 , 
but it needs of course testing by someone who actually uses 3.4.2.


Ciao,
Michael.




Re: [Qemu-devel] Re: [PATCH 1/5] Fix i386 Host

2008-01-18 Thread Michael Matz
Hi,

On Fri, 18 Jan 2008, Johannes Schindelin wrote:

   +#if DATA_SIZE == 1 || DATA_SIZE == 2
   +  q (v),
   +#else
 r (v),
   +#endif
 i ((CPU_TLB_SIZE - 1)  CPU_TLB_ENTRY_BITS),
 i (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS),
 i (TARGET_PAGE_MASK | (DATA_SIZE - 1)),
   -- snap --
   
   Michael, Alexander, what is this hunk supposed to do?
  
  This is required to generate valid assembler code.  Without that hunk, the 
  interesting parts of the asm look like so (for DATA_SIZE == 1):
  
  asm ( ... movzbl %b1, %%edx\n ...  : : r (blubb), r (bla) );
 
 Okay, but this only concerns gcc4, apparently.

No, it's nothing to do with GCC.  The instruction itself (movzbl) requires 
an 8-bit register, so it must be made sure by the constraints the that 
operand indeed is one of those four.  If it also works with r then this 
is just pure luck (in that GCC chooses one of the four good registes, and 
not one of the three bad ones allowable with r).

 Can't we guard it with yet another defined(GCC...)?

Only if you want to trust your luck.  I fear I don't have gcc 3.4.2 lying 
around anywhere, so I can't really help debugging this reload breakage in 
that GCC version.  It might help to introduce a temporary to guide GCC 
through this problematic reload case by detaching the global register 
variable from the asm operand.  For cases where it's no problem this 
should be optimized away, so doesn't inhibit a performance cost.  What I 
mean is something like the below.  If someone with gcc 3.4.2 could test 
that ...


Ciao,
Michael.
-- 
--- softmmu_header.h.mm 2008-01-18 14:15:46.0 +0100
+++ softmmu_header.h2008-01-18 14:14:49.0 +0100
@@ -212,6 +212,9 @@ static inline int glue(glue(lds, SUFFIX)
 
 static inline void glue(glue(st, SUFFIX), MEMSUFFIX)(target_ulong ptr, 
RES_TYPE v)
 {
+#if DATA_SIZE == 1 || DATA_SIZE == 2
+RES_TYPE vtmp = v;
+#endif
 asm volatile (movl %0, %%edx\n
   movl %0, %%eax\n
   shrl %3, %%edx\n
@@ -253,7 +256,7 @@ static inline void glue(glue(st, SUFFIX)
 /* NOTE: 'q' would be needed as constraint, but we could not use it
with T1 ! */
 #if DATA_SIZE == 1 || DATA_SIZE == 2
- q (v),
+ q (vtmp),
 #else
   r (v), 
 #endif




Re: [Qemu-devel] [patch] make qemu work with GCC 4

2007-08-31 Thread Michael Matz
Hi,

On Thu, 30 Aug 2007, Thiemo Seufer wrote:

  * ppc: adds -fno-section-anchors to OP_CFLAGS, as dyngen isn't prepared
 to deal with the relocs resulting from using section anchors
 
 Maybe this should be handled more generally then, not ppc specific, like
 other offending compiler options: check if the compiler knows the
 option, if yes, disable the feature.

Yeah, like is done for the other additions to OP_CFLAGS, which are guarded 
by $(call cc-option, ...).

 I solved that by placing one of the T[012] operands into memory
 for HOST_I386, thereby freeing one reg.  Here's some justification 
 of why that doesn't really cost performance: with three free regs
 GCC is already spilling like mad in the snippets, we just trade one
 of those memory accesses (to stack) with one other mem access to 
 the cpu_state structure, which will be in cache.
 
 Could you back up this assumption with some numbers? :-)

I did that downthread.  To me the slowdown is not significant enough, 
although it exists.

  --- qemu-0.9.0.cvs.orig/target-arm/cpu.h2007-06-24 14:09:48.0 
  +0200
  +++ qemu-0.9.0.cvs/target-arm/cpu.h 2007-08-21 21:38:36.0 +0200
  @@ -52,6 +52,9 @@ typedef uint32_t ARMReadCPFunc(void *opa
*/
   
   typedef struct CPUARMState {
  +#if defined(HOST_I386)
  +uint32_t t1;
  +#endif
   /* Regs for current mode.  */
   uint32_t regs[16];
   /* Frequently accessed CPSR bits are stored separately for efficiently.
  diff -urp qemu-0.9.0.cvs.orig/target-arm/exec.h 
  qemu-0.9.0.cvs/target-arm/exec.h
  --- qemu-0.9.0.cvs.orig/target-arm/exec.h   2007-06-03 19:44:36.0 
  +0200
  +++ qemu-0.9.0.cvs/target-arm/exec.h2007-08-21 21:48:48.0 
  +0200
  @@ -23,7 +23,12 @@
   register struct CPUARMState *env asm(AREG0);
   register uint32_t T0 asm(AREG1);
   register uint32_t T1 asm(AREG2);
  +#ifndef HOST_I386
   register uint32_t T2 asm(AREG3);
  +#else
  +#define T2 (env-t1)
  +#endif
 
 T2/t1 mismatch, it seems. Likewise for mips and ppc.

Yes, well, I could have named the env member perhaps temp :-)  I can 
change it, though.


Ciao,
Michael.




Re: [Qemu-devel] [patch] make qemu work with GCC 4

2007-08-30 Thread Michael Matz
Hi,

On Wed, 29 Aug 2007, Michael Matz wrote:

  I solved that by placing one of the T[012] operands into memory
  for HOST_I386, thereby freeing one reg.  Here's some justification
  of why that doesn't really cost performance: with three free regs
  GCC is already spilling like mad in the snippets, we just trade one
  of those memory accesses (to stack) with one other mem access to
  the cpu_state structure, which will be in cache.
  
  Do you have any evidence to support this claim?
 
 Not really, only an apple and orange comparison.  A 1 iteration 
 tests/sha1 run in the same Linux image, with -no-kqemu, on host and target 
 i386:  time ./sha1
 
 with qemu-0.8.2 (compiled by gcc 3.3-hammer): 7.92 seconds
 with qemu-0.9.0-cvs (gcc4.1 compiled, with the patch): 8.15 seconds
 
 I'll try to get a better comparison.

So, I've now compared our 0.9.0 package, once without patch compiled by 
3.3-hammer, and once with patch and compiled by gcc 4.2:

gcc33 compiled: 7.81 seconds (i.e. a bit faster than 0.8.2 was)
gcc42 compiled: 8.07 seconds

I.e. 3% slower.


Ciao,
Michael.




Re: [Qemu-devel] [patch] make qemu work with GCC 4

2007-08-29 Thread Michael Matz
Hi,

On Wed, 29 Aug 2007, Johannes Schindelin wrote:

  The whole patch is against a 0.9.0-cvs version from 2007-07-09 (Alex 
  might know the exact checkout date), so chances are that it still 
  applies :)
 
 It is based on the z80 fork, but it applies relatively cleanly (one 
 trailing whitespace) to the version as of Use unsigned 32-bit load for 
 ld/lduw.
 
 However, I still get this error:
 
 ../dyngen -o op.h op.o
 dyngen: ret or jmp expected at the end of op_tadd_T1_T0_ccTV
 make[1]: *** [op.h] Fehler 1
 make[1]: Leaving directory `/home/me/qemu/sparc-linux-user'

Using SuSE 10.2, i.e. gcc 4.1.2?  For qemu CVS (see my other mail) I 
didn't test linux-user at all.  In our qemu package itself (which builds 
also the linux-user parts) there are some more patches which might fix the 
above problem, don't know yet.  I would try --disable-linux-user in your 
case.  I can look at it later.

 
 When only making i386-softmmu, I still get this (on SuSE 10.2):
 
 In file included from /home/gene099/my/qemu/usb-linux.c:29:
 /usr/include/linux/usbdevice_fs.h:49: error: expected ‘:’, ‘,’, ‘;’, ‘}’ 
 or ‘__attribute__’ before ‘*’ token
 /usr/include/linux/usbdevice_fs.h:56: error: expected ‘:’, ‘,’, ‘;’, ‘}’ 
 or ‘__attribute__’ before ‘*’ token
 /usr/include/linux/usbdevice_fs.h:66: error: expected ‘:’, ‘,’, ‘;’, ‘}’ 
 or ‘__attribute__’ before ‘*’ token
 /usr/include/linux/usbdevice_fs.h:100: error: expected ‘:’, ‘,’, ‘;’, ‘}’ 
 or ‘__attribute__’ before ‘*’ token
 /usr/include/linux/usbdevice_fs.h:116: error: expected ‘:’, ‘,’, ‘;’, ‘}’ 
 or ‘__attribute__’ before ‘*’ token
 /home/me/qemu/usb-linux.c: In function ‘usb_host_handle_data’:
 /home/me/qemu/usb-linux.c:130: error: ‘struct 
 usbdevfs_bulktransfer’ has no member named ‘data’
 make: *** [usb-linux.o] Fehler 1

Yes, that's a problem of the kernel headers on 10.2.  You can work around 
this with the below snippet.


Ciao,
Michael.
-- 
Index: usb-linux.c
===
RCS file: /sources/qemu/qemu/usb-linux.c,v
retrieving revision 1.10
diff -u -p -r1.10 usb-linux.c
--- usb-linux.c 10 Dec 2006 22:11:04 -  1.10
+++ usb-linux.c 29 Aug 2007 11:45:13 -
@@ -26,6 +26,7 @@
 #if defined(__linux__)
 #include dirent.h
 #include sys/ioctl.h
+#define __user
 #include linux/usbdevice_fs.h
 #include linux/version.h



Re: [Qemu-devel] [patch] make qemu work with GCC 4

2007-08-29 Thread Michael Matz
Hi,

On Wed, 29 Aug 2007, Johannes Schindelin wrote:

  Thanks for your effor Michael! Now, I only hope, one of the patches 
  that makes qemu gcc4 compliant are soon merged.
 
 Well, to throw a spanner in the works: this patch is the 4th patch along 
 the lines that I came about.  None of them (AFAICT) was tested well 
 enough to be included in CVS.  Indeed, the biggest problem seems to be 
 to make a patch that not only works on the machine of the poster, but on 
 other machines, too.

Please?  qemu with that patch builds on four architectures, and tests fine 
on two of them (including cross targets) and only because I haven't run it 
on the other two architectures yet.  It might or might not need more 
patches than just mine (as I started from our package), but in that case 
they are independend of making qemu work with gcc 4, which is the only 
thing my patch is concerned about.


Ciao,
Michael.




Re: [Qemu-devel] [patch] make qemu work with GCC 4

2007-08-29 Thread Michael Matz
Hi,

On Wed, 29 Aug 2007, Paul Brook wrote:

 I solved that by placing one of the T[012] operands into memory
 for HOST_I386, thereby freeing one reg.  Here's some justification
 of why that doesn't really cost performance: with three free regs
 GCC is already spilling like mad in the snippets, we just trade one
 of those memory accesses (to stack) with one other mem access to
 the cpu_state structure, which will be in cache.
 
 Do you have any evidence to support this claim?

Not really, only an apple and orange comparison.  A 1 iteration 
tests/sha1 run in the same Linux image, with -no-kqemu, on host and target 
i386:  time ./sha1

with qemu-0.8.2 (compiled by gcc 3.3-hammer): 7.92 seconds
with qemu-0.9.0-cvs (gcc4.1 compiled, with the patch): 8.15 seconds

I'll try to get a better comparison.  Note, though, that this is the only 
easy solution.  Any other solution would either involve improving reload 
pretty much, or rewriting many of the op.c patterns (for all targets) to 
never require more than three registers, and ensure that gcc doesn't 
become clever and mashes some insns together again (and in trying to do 
that probably slow down the snippets again).  Or doing away with dyngen at 
all.  All three solutions are fairly involved, so I'm personally fine with 
the above slowdown (for i386 targets you won't normally get any noticable 
slowdown as you'd use kqemu).

 Last time I did this it caused a significant performance hit. I'd guess 
 that most common ops are simple enough that we don't need more than 3 
 registers.

For i386 target I'm redefining only T1 (as I figured that A0 and T0 are 
used a bit more), it might be that some of the code could be generated in 
a way to not make use of T1 too much, I haven't really poked at that.

  --- qemu-0.9.0.cvs.orig/softmmu_header.h
  -                  : %eax, %ecx, %edx, memory, cc);
  +                  : %eax, %edx, memory, cc);
 
 This change is wrong. The inline asm calls C code which clobbers %ecx.

Indeed, must have been blind.  Okay these are too many clobbers for poor 
gcc4 nevertheless, so a push/pop %ecx around the call needs to be done.  
Courious that this didn't lead to fire all over the place.  See patch 
below.


Ciao,
Michael.

Index: softmmu_header.h
===
RCS file: /sources/qemu/qemu/softmmu_header.h,v
retrieving revision 1.15
diff -u -p -r1.15 softmmu_header.h
--- softmmu_header.h23 May 2007 19:58:10 -  1.15
+++ softmmu_header.h29 Aug 2007 17:27:37 -
@@ -230,9 +230,11 @@ static inline void glue(glue(st, SUFFIX)
 #else
 #error unsupported size
 #endif
+ pushl %%ecx\n
   pushl %6\n
   call %7\n
   popl %%eax\n
+  popl %%ecx\n
   jmp 2f\n
   1:\n
   addl 8(%%edx), %%eax\n
@@ -250,14 +252,18 @@ static inline void glue(glue(st, SUFFIX)
   : r (ptr), 
 /* NOTE: 'q' would be needed as constraint, but we could not use it
with T1 ! */
+#if DATA_SIZE == 1 || DATA_SIZE == 2
+ q (v),
+#else
   r (v), 
+#endif
   i ((CPU_TLB_SIZE - 1)  CPU_TLB_ENTRY_BITS), 
   i (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS), 
   i (TARGET_PAGE_MASK | (DATA_SIZE - 1)),
   m (*(uint32_t *)offsetof(CPUState, 
tlb_table[CPU_MEM_INDEX][0].addr_write)),
   i (CPU_MEM_INDEX),
   m (*(uint8_t *)glue(glue(__st, SUFFIX), MMUSUFFIX))
-  : %eax, %ecx, %edx, memory, cc);
+  : %eax, %edx, memory, cc);
 }
 
 #else

[Qemu-devel] [patch] make qemu work with GCC 4

2007-08-28 Thread Michael Matz
Hi,

[please keep me CCed, I'm not on this list]

the below patch let's qemu be compiled by GCC 4.2 (probably also 4.1 and 
others) for most hosts (i386,x86_64,ia64,ppc).  s390 as host is missing, 
and needs a compiler change to emit the literal store inline again, as the 
literal pool at the end fundamentally breaks the assumption that qemu can 
paste together the code snippets by patching out the return.  I have no 
HOST_{ARM,MIPS*,ALPHA,SPARC*,M68K} machines to compile for that.

It specifically changes these things:

* ppc: adds -fno-section-anchors to OP_CFLAGS, as dyngen isn't prepared
   to deal with the relocs resulting from using section anchors
* ppc: on target-alpha op_reset_FT GCC4 uses a floating point constant 0.0
   to reset the ft regs, which in turn is loaded from the data 
   section.  The reloc for that is unhandled.  Using -ffast-math would 
   work around this, but I chose to be conservative and change only
   the op.c snippet in question.  See the comment there.
* i386: well, most of you will know that GCC4 doesn't compile qemu because 
   of reload.  The inherent problem is, that qemu uses 64bit
   entities in some places (sometimes structs), which GCC (4.x) 
   manages to place in registers, i.e. needs 2 hardregs.  But it 
   sometimes just so happens that an instruction needing such DImode
   reg also has a memory operand with an indexed address (reg plus 
   reg), hence two hardregs more.  But qemu by default leaves just 
   three free registers for compiling op.c -- boom.  This is somewhat 
   hard to work around in GCC (trust me :) ).

   I solved that by placing one of the T[012] operands into memory
   for HOST_I386, thereby freeing one reg.  Here's some justification 
   of why that doesn't really cost performance: with three free regs
   GCC is already spilling like mad in the snippets, we just trade one
   of those memory accesses (to stack) with one other mem access to 
   the cpu_state structure, which will be in cache.

   Additionally I made sure that I put the least used Tx global into 
   memory.  I haven't done much performance measurements but noticed 
   no glaring problems.

   Two more obvious changes in an inline asm in softmmu_header.h were 
   necessary too.

A qemu with that patch was tested on HOST_I386 with all images from the 
qemu homepage (i.e. arm, mips, mipsel, coldfire (that doesn't work) and 
sparc), some i386 boot isos and a freedos image.  It also was tested on 
HOST_X86_64 installing openSUSE beta-something for i386 and x86_64.  I 
haven't yet tested it on ia64 or ppc hosts.

The whole patch is against a 0.9.0-cvs version from 2007-07-09 (Alex might 
know the exact checkout date), so chances are that it still applies :)


Ciao,
Michael.

diff -urp qemu-0.9.0.cvs.orig/softmmu_header.h qemu-0.9.0.cvs/softmmu_header.h
--- qemu-0.9.0.cvs.orig/softmmu_header.h2007-08-21 18:58:00.0 
+0200
+++ qemu-0.9.0.cvs/softmmu_header.h 2007-08-21 20:40:23.0 +0200
@@ -254,14 +254,18 @@ static inline void glue(glue(st, SUFFIX)
   : r (ptr), 
 /* NOTE: 'q' would be needed as constraint, but we could not use it
with T1 ! */
+#if DATA_SIZE == 1 || DATA_SIZE == 2
+ q (v),
+#else
   r (v), 
+#endif
   i ((CPU_TLB_SIZE - 1)  CPU_TLB_ENTRY_BITS), 
   i (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS), 
   i (TARGET_PAGE_MASK | (DATA_SIZE - 1)),
   m (*(uint32_t *)offsetof(CPUState, 
tlb_table[CPU_MEM_INDEX][0].addr_write)),
   i (CPU_MEM_INDEX),
   m (*(uint8_t *)glue(glue(__st, SUFFIX), MMUSUFFIX))
-  : %eax, %ecx, %edx, memory, cc);
+  : %eax, %edx, memory, cc);
 }
 
 #else
diff -urp qemu-0.9.0.cvs.orig/target-alpha/cpu.h 
qemu-0.9.0.cvs/target-alpha/cpu.h
--- qemu-0.9.0.cvs.orig/target-alpha/cpu.h  2007-06-03 23:02:37.0 
+0200
+++ qemu-0.9.0.cvs/target-alpha/cpu.h   2007-08-22 00:54:15.0 +0200
@@ -278,6 +278,8 @@ struct CPUAlphaState {
  * used to emulate 64 bits target on 32 bits hosts
  */ 
 target_ulong t0, t1, t2;
+#elif defined(HOST_I386)
+target_ulong t2;
 #endif
 /* */
 double ft0, ft1, ft2;
diff -urp qemu-0.9.0.cvs.orig/target-alpha/exec.h 
qemu-0.9.0.cvs/target-alpha/exec.h
--- qemu-0.9.0.cvs.orig/target-alpha/exec.h 2007-06-03 19:44:36.0 
+0200
+++ qemu-0.9.0.cvs/target-alpha/exec.h  2007-08-21 21:28:58.0 +0200
@@ -40,7 +40,11 @@ register struct CPUAlphaState *env asm(A
 
 register uint64_t T0 asm(AREG1);
 register uint64_t T1 asm(AREG2);
+#ifndef HOST_I386
 register uint64_t T2 asm(AREG3);
+#else
+#define T2 (env-t2)
+#endif
 
 #endif /* TARGET_LONG_BITS  HOST_LONG_BITS */
 
diff -urp qemu-0.9.0.cvs.orig/target-arm/cpu.h qemu-0.9.0.cvs/target-arm/cpu.h
--- qemu-0.9.0.cvs.orig/target-arm/cpu.h