Re: [Qemu-devel] [PATCH v2 02/12] ARM: Prepare translation for AArch64 code

2013-05-01 Thread Laurent Desnogues
On Wednesday, May 1, 2013, Richard Henderson r...@twiddle.net wrote:
 On 2013-04-30 07:36, John Rigby wrote:

   uint32_t regs[16];
 +
 +/* Regs for A64 mode.  */
 +uint64_t xregs[31];
 +uint64_t pc;
 +uint64_t sp;
 +uint32_t pstate;
 +uint32_t aarch64_state; /* 1 if CPU is in aarch64 state */
 +

 How do these registers overlap (or not) in real hardware?
 Is it possible to union these with the 32-bit state?

There is an overlap between 32- and 64-bit state registers,
but it's against the set of 32 (?) 32-bit registers that exist
across all modes, not against the 16 registers as used here.

IMHO it doesn't make sense to union them, the mapping
can be done when switching from 32- to 64-bit modes.


Laurent


Re: [Qemu-devel] [PATCH v2 03/12] ARM: Add AArch64 translation stub

2013-04-30 Thread Laurent Desnogues
On Tue, Apr 30, 2013 at 8:36 AM, John Rigby john.ri...@linaro.org wrote:
 From: Alexander Graf ag...@suse.de

 We should translate AArch64 mode separately from AArch32 mode. In AArch64 
 mode,
 registers look vastly different, instruction encoding is completely different,
 basically the system turns into a different machine.

 So let's do a simple if() in translate.c to decide whether we can handle the
 current code in the legacy AArch32 code or in the new AArch64 code.

 So far, the translation always complains about unallocated instructions. There
 is no emulator functionality in this patch!

 Signed-off-by: Alexander Graf ag...@suse.de
 ---
 Changes in v2:
 - Remove uses of is_a64 that are not needed because arch choice happens at
 a higher level.
 - aarch64 register arrays now only have 31 entries with sp and xzr treated as
 special cases.

 +if (is_a64(env)) {
  target-arm/Makefile.objs   |1 +
  target-arm/translate-a64.c |  139 
 
  target-arm/translate.c |9 +++
  target-arm/translate.h |6 ++
  4 files changed, 155 insertions(+)
  create mode 100644 target-arm/translate-a64.c

 diff --git a/target-arm/Makefile.objs b/target-arm/Makefile.objs
 index d89b57c..e488edb 100644
 --- a/target-arm/Makefile.objs
 +++ b/target-arm/Makefile.objs
 @@ -3,3 +3,4 @@ obj-$(CONFIG_SOFTMMU) += machine.o
  obj-$(CONFIG_KVM) += kvm.o
  obj-y += translate.o op_helper.o helper.o cpu.o
  obj-y += neon_helper.o iwmmxt_helper.o
 +obj-$(TARGET_AARCH64) += translate-a64.o
 diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
 new file mode 100644
 index 000..961792d
 --- /dev/null
 +++ b/target-arm/translate-a64.c
 @@ -0,0 +1,139 @@
 +/*
 + *  AArch64 translation
 + *
 + *  Copyright (c) 2013 Alexander Graf ag...@suse.de
 + *
 + * This library is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2 of the License, or (at your option) any later version.
 + *
 + * This library is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with this library; if not, see 
 http://www.gnu.org/licenses/.
 + */
 +#include stdarg.h
 +#include stdlib.h
 +#include stdio.h
 +#include string.h
 +#include inttypes.h
 +
 +#include cpu.h
 +#include tcg-op.h
 +#include qemu/log.h
 +#include translate.h
 +#include qemu/host-utils.h
 +
 +#include helper.h
 +#define GEN_HELPER 1
 +#include helper.h
 +
 +static TCGv_i64 cpu_X[31];
 +static TCGv_i64 cpu_pc;
 +static TCGv_i64 cpu_sp;
 +static TCGv_i32 pstate;
 +
 +static const char *regnames[] =
 +{ x0, x1, x2, x3, x4, x5, x6, x7,
 +  x8, x9, x10, x11, x12, x13, x14, x15,
 +  x16, x17, x18, x19, x20, x21, x22, x23,
 +  x24, x25, x26, x27, x28, x29, lr };
 +
 +/* initialize TCG globals.  */
 +void a64_translate_init(void)
 +{
 +int i;
 +
 +cpu_pc = tcg_global_mem_new_i64(TCG_AREG0,
 +offsetof(CPUARMState, pc),
 +pc);
 +cpu_sp = tcg_global_mem_new_i64(TCG_AREG0,
 +offsetof(CPUARMState, sp),
 +sp);
 +for (i = 0; i  31; i++) {
 +cpu_X[i] = tcg_global_mem_new_i64(TCG_AREG0,
 +  offsetof(CPUARMState, xregs[i]),
 +  regnames[i]);
 +}
 +
 +pstate = tcg_global_mem_new_i32(TCG_AREG0,
 +offsetof(CPUARMState, pstate),
 +pstate);
 +}
 +
 +void cpu_dump_state_a64(CPUARMState *env, FILE *f, fprintf_function 
 cpu_fprintf,
 +int flags)
 +{
 +int i;
 +
 +cpu_fprintf(f, PC=%016PRIx64  SP=%016PRIx64\n, env-pc, env-sp);
 +for(i = 0; i  31; i++) {
 +cpu_fprintf(f, X%02d=%016PRIx64, i, env-xregs[i]);
 +if ((i % 4) == 3)
 +cpu_fprintf(f, \n);
 +else
 +cpu_fprintf(f,  );
 +}
 +cpu_fprintf(f, XZR=%016PRIx64\n, env-xregs[31]);

I guess you want to print XSP so the format string is wrong
and the register is wrong too, it should be env-sp.

OTOH I would have preferred xregs[31] to be sp :-)


Laurent

 +cpu_fprintf(f, PSTATE=%c%c%c%c\n,
 +env-pstate  PSTATE_N ? 'n' : '.',
 +env-pstate  PSTATE_Z ? 'z' : '.',
 +env-pstate  PSTATE_C ? 'c' : '.',
 +env-pstate  PSTATE_V ? 'v' : '.');
 +cpu_fprintf(f, \n);
 +}
 +
 +void gen_a64_set_pc_im(uint64_t val)
 +{
 +tcg_gen_movi_i64(cpu_pc, val);
 +}
 +
 +static void gen_exception(int excp)
 +{
 +TCGv_i32 tmp = 

Re: [Qemu-devel] Latest QEMU compilation fails

2013-04-09 Thread Laurent Desnogues
On Tue, Apr 9, 2013 at 11:07 AM, Michal Novotny minov...@redhat.com wrote:
 Hi,
 I've just pulled the latest git repository of QEMU but it's failing the
 compilation on missing virtio-9p-device.h header file:
   CCx86_64-softmmu/target-i386/arch_memory_mapping.o
   CCx86_64-softmmu/target-i386/arch_dump.o
   CCx86_64-softmmu/target-i386/kvm.o
   CCx86_64-softmmu/target-i386/hyperv.o
   CCx86_64-softmmu/hw/9pfs/virtio-9p-device.o
   CCx86_64-softmmu/hw/block/virtio-blk.o
   CCx86_64-softmmu/hw/char/virtio-serial-bus.o
   CCx86_64-softmmu/hw/9pfs/virtio-9p-device.o
 /home/mig/Work/kvm-downstream/git/qemu/hw/9pfs/virtio-9p-device.c:19:30:
 fatal error: virtio-9p-device.h: No such file or directory
 compilation terminated.
 make[1]: *** [hw/9pfs/virtio-9p-device.o] Error 1
 make: *** [subdir-x86_64-softmmu] Error 2

 Does it do for others as well or is it just a local issue?

Same here (47b5264eb3e1cd2825e48d28fd0d1b239ed53974).
The missing file used to be under hw/9pfs/.


Laurent

 Thanks,
 Michal

 --
 Michal Novotny minov...@redhat.com, RHCE, Red Hat
 Virtualization | libvirt-php bindings | php-virt-control.org





Re: [Qemu-devel] Finding first TranslationBlock in user mode emulation

2013-04-05 Thread Laurent Desnogues
On Fri, Apr 5, 2013 at 4:15 PM, Peter Maydell peter.mayd...@linaro.org wrote:
 On 5 April 2013 15:06, Ivan Jovanovic jovanovic.d.i...@gmail.com wrote:
 When I examine in_asm and out_asm logs I notice that before my program
 starts emulating, dynamic linker code is being emulated, which is, of
 course, expected.

 My question is, is there a way in qemu (a flag or something similar) to
 determine during emulation when the dl code finishes executing and execution
 of the first TranslationBlock of my code starts?

 No, because QEMU doesn't care what your program does -- it just
 emulates code and doesn't need to know whether it is in the
 dynamic linker or the main program binary or a shared library
 or even code your program has dynamically generated at runtime.

 I suggest you look at the symbols for your guest binary (with
 'nm' or 'objdump', for example) and match up the addresses in
 them with your code. (In particular this should let you find out
 which TB corresponds to the main() function.) You can either do
 this by hand or you could probably write a script to annotate
 the logs for you.

There's an easier way that will work even with stripped execs:
use the ELF entry point.

readelf -l prog | grep -i entry

HTH,

Laurent



Re: [Qemu-devel] [PATCH v3 05/20] tcg-arm: Handle constant arguments to add2/sub2

2013-03-28 Thread Laurent Desnogues
On Thu, Mar 28, 2013 at 5:04 PM, Richard Henderson r...@twiddle.net wrote:
 On 03/28/2013 08:56 AM, Peter Maydell wrote:
 +#define TO_CPSR (1  20)

 This is the S bit; I think it would be helpful if our #define
 had a name that made that clearer...

 Suggestions?  I thought TO_CPSR was clear...

SET_FLAGS?

 +ARITH_TST = 0x8  21 | TO_CPSR,
 +ARITH_CMP = 0xa  21 | TO_CPSR,
 +ARITH_CMN = 0xb  21 | TO_CPSR,
 +ARITH_ORR = 0xc  21,
 +ARITH_MOV = 0xd  21,
 +ARITH_BIC = 0xe  21,
 +ARITH_MVN = 0xf  21,
  };

 It feels a little ugly to OR in the S bit in this enum, but I guess
 it works.

 I think it actually makes everything a lot cleaner myself.  Especially given
 the previous definition that required runtime checks for TST et al.

I also find this cleaner.  OTOH I understand that Peter finds it
odd given that other operations (such as ADD) can optionally
set flags.


Laurent



Re: [Qemu-devel] qemu-x86_64 on i386 host: SIGSEGV

2013-03-25 Thread Laurent Desnogues
On Mon, Mar 25, 2013 at 4:03 PM, Richard Henderson r...@twiddle.net wrote:
 On 03/24/2013 03:59 AM, Peter Maydell wrote:
 PC is FF600400 so either we've messed it up already or this
 is just 64 bit address space doesn't fit in a 32 bit one.

 This is probably the fallback vdso address.

Yes, it looks like __NR_vtime.


Laurent

 I've previously sent patches to the list (several times) to add a real
 vdso to qemu for x86_64, so that the glibc will do the right thing, but
 the patches never got reviewed or applied.

 I could revive them if someone commits to reviewing them this time.


 r~




Re: [Qemu-devel] [PATCH 11/20] qemu-char: use a glib timeout instead of qemu-timer

2013-03-15 Thread Laurent Desnogues
Hello,

On Tue, Mar 5, 2013 at 6:51 PM, Amit Shah amit.s...@redhat.com wrote:
 From: Anthony Liguori aligu...@us.ibm.com

 Signed-off-by: Anthony Liguori aligu...@us.ibm.com
 Signed-off-by: Amit Shah amit.s...@redhat.com
 ---
  qemu-char.c | 68 
 -
  1 file changed, 45 insertions(+), 23 deletions(-)

 diff --git a/qemu-char.c b/qemu-char.c
 index eb0ac81..6dba943 100644
 --- a/qemu-char.c
 +++ b/qemu-char.c
 @@ -990,12 +990,50 @@ typedef struct {
  int connected;
  int polling;
  int read_bytes;
 -QEMUTimer *timer;
 +guint timer_tag;
  } PtyCharDriver;

  static void pty_chr_update_read_handler(CharDriverState *chr);
  static void pty_chr_state(CharDriverState *chr, int connected);

 +static gboolean pty_chr_timer(gpointer opaque)
 +{
 +struct CharDriverState *chr = opaque;
 +PtyCharDriver *s = chr-opaque;
 +
 +if (s-connected) {
 +goto out;
 +}
 +if (s-polling) {
 +/* If we arrive here without polling being cleared due
 + * read returning -EIO, then we are (re-)connected */
 +pty_chr_state(chr, 1);
 +goto out;
 +}
 +
 +/* Next poll ... */
 +pty_chr_update_read_handler(chr);
 +
 +out:
 +return FALSE;
 +}
 +
 +static void pty_chr_rearm_timer(CharDriverState *chr, int ms)
 +{
 +PtyCharDriver *s = chr-opaque;
 +
 +if (s-timer_tag) {
 +g_source_remove(s-timer_tag);
 +s-timer_tag = 0;
 +}
 +
 +if (ms == 1000) {
 +s-timer_tag = g_timeout_add_seconds(1, pty_chr_timer, chr);

It looks like g_timeout_add_seconds isn't available for
poor people using some old distros (glib 2.12.3 here).

Thanks,

Laurent

 +} else {
 +s-timer_tag = g_timeout_add(ms, pty_chr_timer, chr);
 +}
 +}
 +
  static int pty_chr_write(CharDriverState *chr, const uint8_t *buf, int len)
  {
  PtyCharDriver *s = chr-opaque;
 @@ -1065,7 +1103,7 @@ static void pty_chr_update_read_handler(CharDriverState 
 *chr)
   * timeout to the normal (much longer) poll interval before the
   * timer triggers.
   */
 -qemu_mod_timer(s-timer, qemu_get_clock_ms(rt_clock) + 10);
 +pty_chr_rearm_timer(chr, 10);
  }

  static void pty_chr_state(CharDriverState *chr, int connected)
 @@ -1080,7 +1118,7 @@ static void pty_chr_state(CharDriverState *chr, int 
 connected)
  /* (re-)connect poll interval for idle guests: once per second.
   * We check more frequently in case the guests sends data to
   * the virtual device linked to our pty. */
 -qemu_mod_timer(s-timer, qemu_get_clock_ms(rt_clock) + 1000);
 +pty_chr_rearm_timer(chr, 1000);
  } else {
  if (!s-connected)
  qemu_chr_generic_open(chr);
 @@ -1088,23 +1126,6 @@ static void pty_chr_state(CharDriverState *chr, int 
 connected)
  }
  }

 -static void pty_chr_timer(void *opaque)
 -{
 -struct CharDriverState *chr = opaque;
 -PtyCharDriver *s = chr-opaque;
 -
 -if (s-connected)
 -return;
 -if (s-polling) {
 -/* If we arrive here without polling being cleared due
 - * read returning -EIO, then we are (re-)connected */
 -pty_chr_state(chr, 1);
 -return;
 -}
 -
 -/* Next poll ... */
 -pty_chr_update_read_handler(chr);
 -}

  static void pty_chr_close(struct CharDriverState *chr)
  {
 @@ -1117,8 +1138,9 @@ static void pty_chr_close(struct CharDriverState *chr)
  fd = g_io_channel_unix_get_fd(s-fd);
  g_io_channel_unref(s-fd);
  close(fd);
 -qemu_del_timer(s-timer);
 -qemu_free_timer(s-timer);
 +if (s-timer_tag) {
 +g_source_remove(s-timer_tag);
 +}
  g_free(s);
  qemu_chr_be_event(chr, CHR_EVENT_CLOSED);
  }
 @@ -1170,7 +1192,7 @@ static CharDriverState *qemu_chr_open_pty(QemuOpts 
 *opts)
  chr-chr_add_watch = pty_chr_add_watch;

  s-fd = io_channel_from_fd(master_fd);
 -s-timer = qemu_new_timer_ms(rt_clock, pty_chr_timer, chr);
 +s-timer_tag = 0;

  return chr;
  }
 --
 1.8.1.2





Re: [Qemu-devel] [PATCH 11/20] qemu-char: use a glib timeout instead of qemu-timer

2013-03-15 Thread Laurent Desnogues
On Fri, Mar 15, 2013 at 4:44 PM, Anthony Liguori aligu...@us.ibm.com wrote:
 Laurent Desnogues laurent.desnog...@gmail.com writes:

 Hello,

 On Tue, Mar 5, 2013 at 6:51 PM, Amit Shah amit.s...@redhat.com wrote:
 From: Anthony Liguori aligu...@us.ibm.com

 Signed-off-by: Anthony Liguori aligu...@us.ibm.com
 Signed-off-by: Amit Shah amit.s...@redhat.com
 ---
  qemu-char.c | 68 
 -
  1 file changed, 45 insertions(+), 23 deletions(-)

 diff --git a/qemu-char.c b/qemu-char.c
 index eb0ac81..6dba943 100644
 --- a/qemu-char.c
 +++ b/qemu-char.c
 @@ -990,12 +990,50 @@ typedef struct {
  int connected;
  int polling;
  int read_bytes;
 -QEMUTimer *timer;
 +guint timer_tag;
  } PtyCharDriver;

  static void pty_chr_update_read_handler(CharDriverState *chr);
  static void pty_chr_state(CharDriverState *chr, int connected);

 +static gboolean pty_chr_timer(gpointer opaque)
 +{
 +struct CharDriverState *chr = opaque;
 +PtyCharDriver *s = chr-opaque;
 +
 +if (s-connected) {
 +goto out;
 +}
 +if (s-polling) {
 +/* If we arrive here without polling being cleared due
 + * read returning -EIO, then we are (re-)connected */
 +pty_chr_state(chr, 1);
 +goto out;
 +}
 +
 +/* Next poll ... */
 +pty_chr_update_read_handler(chr);
 +
 +out:
 +return FALSE;
 +}
 +
 +static void pty_chr_rearm_timer(CharDriverState *chr, int ms)
 +{
 +PtyCharDriver *s = chr-opaque;
 +
 +if (s-timer_tag) {
 +g_source_remove(s-timer_tag);
 +s-timer_tag = 0;
 +}
 +
 +if (ms == 1000) {
 +s-timer_tag = g_timeout_add_seconds(1, pty_chr_timer, chr);

 It looks like g_timeout_add_seconds isn't available for
 poor people using some old distros (glib 2.12.3 here).

 Can you test adding:

 #if !GLIB_CHECK_VERSION(2, 14, 0)
 static guint g_timeout_add_seconds(guint interval, GSourceFunc function,
gpointer data)
 {
 return g_timeout_add(interval * 1000, function, data);
 }
 #endif

That works fine, thanks for looking!

 We probably should introduce a glib-compat to centralize work arounds
 for older versions of glib...

Agreed.

Thanks again,

Laurent

 Regards,

 Anthony Liguori


 Thanks,

 Laurent

 +} else {
 +s-timer_tag = g_timeout_add(ms, pty_chr_timer, chr);
 +}
 +}
 +
  static int pty_chr_write(CharDriverState *chr, const uint8_t *buf, int len)
  {
  PtyCharDriver *s = chr-opaque;
 @@ -1065,7 +1103,7 @@ static void 
 pty_chr_update_read_handler(CharDriverState *chr)
   * timeout to the normal (much longer) poll interval before the
   * timer triggers.
   */
 -qemu_mod_timer(s-timer, qemu_get_clock_ms(rt_clock) + 10);
 +pty_chr_rearm_timer(chr, 10);
  }

  static void pty_chr_state(CharDriverState *chr, int connected)
 @@ -1080,7 +1118,7 @@ static void pty_chr_state(CharDriverState *chr, int 
 connected)
  /* (re-)connect poll interval for idle guests: once per second.
   * We check more frequently in case the guests sends data to
   * the virtual device linked to our pty. */
 -qemu_mod_timer(s-timer, qemu_get_clock_ms(rt_clock) + 1000);
 +pty_chr_rearm_timer(chr, 1000);
  } else {
  if (!s-connected)
  qemu_chr_generic_open(chr);
 @@ -1088,23 +1126,6 @@ static void pty_chr_state(CharDriverState *chr, int 
 connected)
  }
  }

 -static void pty_chr_timer(void *opaque)
 -{
 -struct CharDriverState *chr = opaque;
 -PtyCharDriver *s = chr-opaque;
 -
 -if (s-connected)
 -return;
 -if (s-polling) {
 -/* If we arrive here without polling being cleared due
 - * read returning -EIO, then we are (re-)connected */
 -pty_chr_state(chr, 1);
 -return;
 -}
 -
 -/* Next poll ... */
 -pty_chr_update_read_handler(chr);
 -}

  static void pty_chr_close(struct CharDriverState *chr)
  {
 @@ -1117,8 +1138,9 @@ static void pty_chr_close(struct CharDriverState *chr)
  fd = g_io_channel_unix_get_fd(s-fd);
  g_io_channel_unref(s-fd);
  close(fd);
 -qemu_del_timer(s-timer);
 -qemu_free_timer(s-timer);
 +if (s-timer_tag) {
 +g_source_remove(s-timer_tag);
 +}
  g_free(s);
  qemu_chr_be_event(chr, CHR_EVENT_CLOSED);
  }
 @@ -1170,7 +1192,7 @@ static CharDriverState *qemu_chr_open_pty(QemuOpts 
 *opts)
  chr-chr_add_watch = pty_chr_add_watch;

  s-fd = io_channel_from_fd(master_fd);
 -s-timer = qemu_new_timer_ms(rt_clock, pty_chr_timer, chr);
 +s-timer_tag = 0;

  return chr;
  }
 --
 1.8.1.2






Re: [Qemu-devel] About the Thread model in Qemu【 Is it one thread per vcpu?】

2013-03-13 Thread Laurent Desnogues
On Wed, Mar 13, 2013 at 10:51 AM, Peter Maydell
peter.mayd...@linaro.org wrote:
[...]
 For now, I need a emulator to emulate more cores efficiently.I want to
 analyze whether Qemu can be faster when it emulate more cores, while
 parallelism maybe a good choice.

 There has been some other research work with QEMU in this
 area, you should probably look it up. However it has all been
 we fixed enough of the big problems to be able to run some
 benchmarks and write our research paper, so none of this is
 actually in the QEMU sources. (The short answer I think is
 probably but it will depend a lot on workload, and there's
 a lot of software engineering required to do it reliably.)

I know of two such variants: COREMU and HQEMU, but as far
as I know the latter isn't freely available.


Laurent



Re: [Qemu-devel] [PATCH v2 0/9] tcg-arm improvements

2013-03-13 Thread Laurent Desnogues
On Tue, Mar 12, 2013 at 7:43 AM, Richard Henderson r...@twiddle.net wrote:
 Changes v1-v2:
   * Use more helper functions to handle K and N constraints.
   * Improve add2/sub2.
   * Improve epilogues, as suggested in the previous thread.
   * Fix a typo in the name of the deposit helper.
   * Implement division for cortex-a15.

I applied all of the patch series and tested nbench in user mode
on a quad core Cortex-A9 (which means I didn't test the div part
and the add2/sub2 part since it's i386 code).

It works and gave some nice speedup.

Thanks,

Laurent


 r~


 Richard Henderson (9):
   tcg-arm: Use bic to implement and with constant
   tcg-arm: Handle negated constant arguments to and/sub
   tcg-arm: Allow constant first argument to sub
   tcg-arm: Use tcg_out_dat_rIN for compares
   tcg-arm: Handle constant arguments to add2/sub2
   tcg-arm: Improve constant generation
   tcg-arm: Fold epilogue into INDEX_op_exit_tb
   tcg-arm: Implement deposit for armv7
   tcg-arm: Implement division instructions

  disas/arm.c  |   4 +
  tcg/arm/tcg-target.c | 410 
 ---
  tcg/arm/tcg-target.h |  14 +-
  3 files changed, 304 insertions(+), 124 deletions(-)

 --
 1.8.1.2





Re: [Qemu-devel] [PATCH 01/19] linux-user: stack_base is now mandatory on all targets

2013-03-07 Thread Laurent Desnogues
On Wed, Feb 8, 2012 at 10:46 AM, Laurent Desnogues
laurent.desnog...@gmail.com wrote:
 On Fri, Feb 3, 2012 at 3:49 PM,  riku.voi...@linaro.org wrote:
 From: Riku Voipio riku.voi...@linaro.org

 Signed-off-by: Riku Voipio riku.voi...@linaro.org
 ---
  linux-user/qemu.h |2 +-
  1 files changed, 1 insertions(+), 1 deletions(-)

 diff --git a/linux-user/qemu.h b/linux-user/qemu.h
 index 55ad9d8..30e2abd 100644
 --- a/linux-user/qemu.h
 +++ b/linux-user/qemu.h
 @@ -123,10 +123,10 @@ typedef struct TaskState {
  #endif
  #if defined(TARGET_ARM) || defined(TARGET_M68K) || defined(TARGET_UNICORE32)
 /* Extra fields for semihosted binaries.  */
 -uint32_t stack_base;
 uint32_t heap_base;
 uint32_t heap_limit;
  #endif
 +uint32_t stack_base;

 Shouldn't this be abi_ulong instead of uint32_t?

Ping...

Yes it's more than a year old, but if Aarch64 support with semihosting
ever comes to life :)

Laurent

 int used; /* non zero if used */
 struct image_info *info;
 struct linux_binprm *bprm;

 Laurent



Re: [Qemu-devel] [PATCH 03/12] ARM: Prepare translation for AArch64 code

2013-03-06 Thread Laurent Desnogues
On Wed, Mar 6, 2013 at 10:36 AM, Alexander Graf ag...@suse.de wrote:


 Am 06.03.2013 um 08:11 schrieb Laurent Desnogues 
 laurent.desnog...@gmail.com:

 On Wed, Mar 6, 2013 at 3:01 AM, Alexander Graf ag...@suse.de wrote:
 This patch adds all the prerequisites for AArch64 support that didn't
 fit into split up patches. It extends important bits in the core cpu
 headers to also take AArch64 mode into account.

 Signed-off-by: Alexander Graf ag...@suse.de
 ---
 include/elf.h  |2 +
 target-arm/cpu.h   |  103 
 
 target-arm/translate.c |   42 +++
 3 files changed, 103 insertions(+), 44 deletions(-)

 diff --git a/include/elf.h b/include/elf.h
 index a21ea53..0ff0ea6 100644
 --- a/include/elf.h
 +++ b/include/elf.h
 @@ -109,6 +109,8 @@ typedef int64_t  Elf64_Sxword;
 #define EM_OPENRISC 92/* OpenCores OpenRISC */

 #define EM_UNICORE32110 /* UniCore32 */
 +#define EM_AARCH64  183 /* ARM 64-bit architecture */
 +

 /*
  * This is an interim value that we will use until the committee comes
 diff --git a/target-arm/cpu.h b/target-arm/cpu.h
 index c28a0d9..ec292c9 100644
 --- a/target-arm/cpu.h
 +++ b/target-arm/cpu.h
 @@ -19,13 +19,19 @@
 #ifndef CPU_ARM_H
 #define CPU_ARM_H

 -#define TARGET_LONG_BITS 32
 +#include config.h

 -#define ELF_MACHINEEM_ARM
 +#if defined (TARGET_ARM64)
 +  /* AArch64 definitions */
 +#  define TARGET_LONG_BITS 64
 +#  define ELF_MACHINE  EM_AARCH64
 +#else
 +#  define TARGET_LONG_BITS 32
 +#  define ELF_MACHINE  EM_ARM
 +#endif

 #define CPUArchState struct CPUARMState

 -#include config.h
 #include qemu-common.h
 #include exec/cpu-defs.h

 @@ -79,6 +85,13 @@ struct arm_boot_info;
 typedef struct CPUARMState {
 /* Regs for current mode.  */
 uint32_t regs[16];
 +
 +/* Regs for A64 mode.  */
 +uint64_t xregs[32];

 I'm not sure it makes sense to allocate space for xregs[31].
 If it is the zero register then you would anyway have special
 cases in translation code to discard writes to reg 31 and so
 it could be argued you could also special case reads from
 reg 31 (and you should so that the optimizer knows it's zero).

 Perhaps you could use xregs[31] as SP, which after all is
 a more regular register than xzr.

 Sp is a separate env field in my patch set. So yeah, I should just drop 
 xregs[31] and always special case it I suppose. That's the best way to avoid 
 accidents I hope :)

I think that using xregs[31] as sp would ease code generation.
No need to special case both xzr and sp.

OTOH if you only declare 31 regs and you access xregs[31]
by accident you'd read/write pc which might help uncover
bugs faster, though I wouldn't want to use that feature to
ease development :-)


Laurent

 Alex



 Laurent

 +uint64_t pc;
 +uint64_t sp;
 +uint32_t pstate;
 +
 /* Frequently accessed CPSR bits are stored separately for efficiency.
This contains all the other bits.  Use cpsr_{read,write} to access
the whole CPSR.  */
 @@ -154,6 +167,11 @@ typedef struct CPUARMState {
 uint32_t c15_power_control; /* power control */
 } cp15;

 +/* System registers (AArch64) */
 +struct {
 +uint64_t tpidr_el0;
 +} sr;
 +
 struct {
 uint32_t other_sp;
 uint32_t vecbase;
 @@ -170,7 +188,7 @@ typedef struct CPUARMState {

 /* VFP coprocessor state.  */
 struct {
 -float64 regs[32];
 +float64 regs[64];

 uint32_t xregs[16];
 /* We store these fpcsr fields separately for convenience.  */
 @@ -241,6 +259,24 @@ int bank_number(int mode);
 void switch_mode(CPUARMState *, int);
 uint32_t do_arm_semihosting(CPUARMState *env);

 +static inline bool is_a64(CPUARMState *env)
 +{
 +#ifdef TARGET_ARM64
 +return true;
 +#else
 +return false;
 +#endif
 +}
 +
 +#define PSTATE_N_SHIFT 3
 +#define PSTATE_N  (1  PSTATE_N_SHIFT)
 +#define PSTATE_Z_SHIFT 2
 +#define PSTATE_Z  (1  PSTATE_Z_SHIFT)
 +#define PSTATE_C_SHIFT 1
 +#define PSTATE_C  (1  PSTATE_C_SHIFT)
 +#define PSTATE_V_SHIFT 0
 +#define PSTATE_V  (1  PSTATE_V_SHIFT)
 +
 /* you can call this signal handler from your SIGBUS and SIGSEGV
signal handlers to inform the virtual CPU of exceptions. non zero
is returned if the signal was handled by the virtual CPU.  */
 @@ -624,8 +660,13 @@ static inline bool cp_access_ok(CPUARMState *env,
 #define TARGET_PAGE_BITS 10
 #endif

 -#define TARGET_PHYS_ADDR_SPACE_BITS 40
 -#define TARGET_VIRT_ADDR_SPACE_BITS 32
 +#if defined (TARGET_ARM64)
 +#  define TARGET_PHYS_ADDR_SPACE_BITS 64
 +#  define TARGET_VIRT_ADDR_SPACE_BITS 64
 +#else
 +#  define TARGET_PHYS_ADDR_SPACE_BITS 40
 +#  define TARGET_VIRT_ADDR_SPACE_BITS 32
 +#endif

 static inline CPUARMState *cpu_init(const char *cpu_model)
 {
 @@ -699,25 +740,31 @@ static inline void cpu_clone_regs(CPUARMState *env, 
 target_ulong newsp)
 static inline void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc

Re: [Qemu-devel] [PATCH 04/12] ARM: Add AArch64 translation stub

2013-03-05 Thread Laurent Desnogues
(This mail was previously sent by accident to Alexander only.)

On Wed, Mar 6, 2013 at 3:01 AM, Alexander Graf ag...@suse.de wrote:
 We should translate AArch64 mode separately from AArch32 mode. In AArch64 
 mode,
 registers look vastly different, instruction encoding is completely different,
 basically the system turns into a different machine.

 So let's do a simple if() in translate.c to decide whether we can handle the
 current code in the legacy AArch32 code or in the new AArch64 code.

 So far, the translation always complains about unallocated instructions. There
 is no emulator functionality in this patch!

 Signed-off-by: Alexander Graf ag...@suse.de
 ---
  target-arm/Makefile.objs   |1 +
  target-arm/translate-a64.c |  139 
 
  target-arm/translate.c |   15 +
  target-arm/translate.h |6 ++
  4 files changed, 161 insertions(+), 0 deletions(-)
  create mode 100644 target-arm/translate-a64.c

 diff --git a/target-arm/Makefile.objs b/target-arm/Makefile.objs
 index d89b57c..f9f0de4 100644
 --- a/target-arm/Makefile.objs
 +++ b/target-arm/Makefile.objs
 @@ -3,3 +3,4 @@ obj-$(CONFIG_SOFTMMU) += machine.o
  obj-$(CONFIG_KVM) += kvm.o
  obj-y += translate.o op_helper.o helper.o cpu.o
  obj-y += neon_helper.o iwmmxt_helper.o
 +obj-$(TARGET_ARM64) += translate-a64.o
 diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
 new file mode 100644
 index 000..f354d08
 --- /dev/null
 +++ b/target-arm/translate-a64.c
 @@ -0,0 +1,139 @@
 +/*
 + *  AArch64 translation
 + *
 + *  Copyright (c) 2013 Alexander Graf ag...@suse.de
 + *
 + * This library is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2 of the License, or (at your option) any later version.
 + *
 + * This library is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with this library; if not, see 
 http://www.gnu.org/licenses/.
 + */
 +#include stdarg.h
 +#include stdlib.h
 +#include stdio.h
 +#include string.h
 +#include inttypes.h
 +
 +#include cpu.h
 +#include tcg-op.h
 +#include qemu/log.h
 +#include translate.h
 +#include qemu/host-utils.h
 +
 +#include helper.h
 +#define GEN_HELPER 1
 +#include helper.h
 +
 +static TCGv_i64 cpu_X[32];
 +static TCGv_i64 cpu_pc;
 +static TCGv_i64 cpu_sp;
 +static TCGv_i32 pstate;
 +
 +static const char *regnames[] =
 +{ x0, x1, x2, x3, x4, x5, x6, x7,
 +  x8, x9, x10, x11, x12, x13, x14, x15,
 +  x16, x17, x18, x19, x20, x21, x22, x23,
 +  x24, x25, x26, x27, x28, x29, lr, xzr };

Wouldn't it be better to use sp instead of xzr?  I would
expect the translator not to emit TCG ops with reg 31
meaning zero :-)


Laurent

 +
 +/* initialize TCG globals.  */
 +void a64_translate_init(void)
 +{
 +int i;
 +
 +cpu_pc = tcg_global_mem_new_i64(TCG_AREG0,
 +offsetof(CPUARMState, pc),
 +pc);
 +cpu_sp = tcg_global_mem_new_i64(TCG_AREG0,
 +offsetof(CPUARMState, sp),
 +sp);
 +for (i = 0; i  32; i++) {
 +cpu_X[i] = tcg_global_mem_new_i64(TCG_AREG0,
 +  offsetof(CPUARMState, xregs[i]),
 +  regnames[i]);
 +}
 +
 +pstate = tcg_global_mem_new_i32(TCG_AREG0,
 +offsetof(CPUARMState, pstate),
 +pstate);
 +}
 +
 +void cpu_dump_state_a64(CPUARMState *env, FILE *f, fprintf_function 
 cpu_fprintf,
 +int flags)
 +{
 +int i;
 +
 +cpu_fprintf(f, PC=%016PRIx64  SP=%016PRIx64\n, env-pc, env-sp);
 +for(i = 0; i  31; i++) {
 +cpu_fprintf(f, X%02d=%016PRIx64, i, env-xregs[i]);
 +if ((i % 4) == 3)
 +cpu_fprintf(f, \n);
 +else
 +cpu_fprintf(f,  );
 +}
 +cpu_fprintf(f, XZR=%016PRIx64\n, env-xregs[31]);
 +cpu_fprintf(f, PSTATE=%c%c%c%c\n,
 +env-pstate  PSTATE_N ? 'n' : '.',
 +env-pstate  PSTATE_Z ? 'z' : '.',
 +env-pstate  PSTATE_C ? 'c' : '.',
 +env-pstate  PSTATE_V ? 'v' : '.');
 +cpu_fprintf(f, \n);
 +}
 +
 +void gen_a64_set_pc_im(uint64_t val)
 +{
 +tcg_gen_movi_i64(cpu_pc, val);
 +}
 +
 +static void gen_exception(int excp)
 +{
 +TCGv_i32 tmp = tcg_temp_new_i32();
 +tcg_gen_movi_i32(tmp, excp);
 +gen_helper_exception(cpu_env, tmp);
 +tcg_temp_free_i32(tmp);
 +}
 +
 +static void gen_exception_insn(DisasContext *s, int offset, int excp)
 +{
 +

Re: [Qemu-devel] [PATCH 03/12] ARM: Prepare translation for AArch64 code

2013-03-05 Thread Laurent Desnogues
On Wed, Mar 6, 2013 at 3:01 AM, Alexander Graf ag...@suse.de wrote:
 This patch adds all the prerequisites for AArch64 support that didn't
 fit into split up patches. It extends important bits in the core cpu
 headers to also take AArch64 mode into account.

 Signed-off-by: Alexander Graf ag...@suse.de
 ---
  include/elf.h  |2 +
  target-arm/cpu.h   |  103 
 
  target-arm/translate.c |   42 +++
  3 files changed, 103 insertions(+), 44 deletions(-)

 diff --git a/include/elf.h b/include/elf.h
 index a21ea53..0ff0ea6 100644
 --- a/include/elf.h
 +++ b/include/elf.h
 @@ -109,6 +109,8 @@ typedef int64_t  Elf64_Sxword;
  #define EM_OPENRISC 92/* OpenCores OpenRISC */

  #define EM_UNICORE32110 /* UniCore32 */
 +#define EM_AARCH64  183 /* ARM 64-bit architecture */
 +

  /*
   * This is an interim value that we will use until the committee comes
 diff --git a/target-arm/cpu.h b/target-arm/cpu.h
 index c28a0d9..ec292c9 100644
 --- a/target-arm/cpu.h
 +++ b/target-arm/cpu.h
 @@ -19,13 +19,19 @@
  #ifndef CPU_ARM_H
  #define CPU_ARM_H

 -#define TARGET_LONG_BITS 32
 +#include config.h

 -#define ELF_MACHINEEM_ARM
 +#if defined (TARGET_ARM64)
 +  /* AArch64 definitions */
 +#  define TARGET_LONG_BITS 64
 +#  define ELF_MACHINE  EM_AARCH64
 +#else
 +#  define TARGET_LONG_BITS 32
 +#  define ELF_MACHINE  EM_ARM
 +#endif

  #define CPUArchState struct CPUARMState

 -#include config.h
  #include qemu-common.h
  #include exec/cpu-defs.h

 @@ -79,6 +85,13 @@ struct arm_boot_info;
  typedef struct CPUARMState {
  /* Regs for current mode.  */
  uint32_t regs[16];
 +
 +/* Regs for A64 mode.  */
 +uint64_t xregs[32];

I'm not sure it makes sense to allocate space for xregs[31].
If it is the zero register then you would anyway have special
cases in translation code to discard writes to reg 31 and so
it could be argued you could also special case reads from
reg 31 (and you should so that the optimizer knows it's zero).

Perhaps you could use xregs[31] as SP, which after all is
a more regular register than xzr.


Laurent

 +uint64_t pc;
 +uint64_t sp;
 +uint32_t pstate;
 +
  /* Frequently accessed CPSR bits are stored separately for efficiency.
 This contains all the other bits.  Use cpsr_{read,write} to access
 the whole CPSR.  */
 @@ -154,6 +167,11 @@ typedef struct CPUARMState {
  uint32_t c15_power_control; /* power control */
  } cp15;

 +/* System registers (AArch64) */
 +struct {
 +uint64_t tpidr_el0;
 +} sr;
 +
  struct {
  uint32_t other_sp;
  uint32_t vecbase;
 @@ -170,7 +188,7 @@ typedef struct CPUARMState {

  /* VFP coprocessor state.  */
  struct {
 -float64 regs[32];
 +float64 regs[64];

  uint32_t xregs[16];
  /* We store these fpcsr fields separately for convenience.  */
 @@ -241,6 +259,24 @@ int bank_number(int mode);
  void switch_mode(CPUARMState *, int);
  uint32_t do_arm_semihosting(CPUARMState *env);

 +static inline bool is_a64(CPUARMState *env)
 +{
 +#ifdef TARGET_ARM64
 +return true;
 +#else
 +return false;
 +#endif
 +}
 +
 +#define PSTATE_N_SHIFT 3
 +#define PSTATE_N  (1  PSTATE_N_SHIFT)
 +#define PSTATE_Z_SHIFT 2
 +#define PSTATE_Z  (1  PSTATE_Z_SHIFT)
 +#define PSTATE_C_SHIFT 1
 +#define PSTATE_C  (1  PSTATE_C_SHIFT)
 +#define PSTATE_V_SHIFT 0
 +#define PSTATE_V  (1  PSTATE_V_SHIFT)
 +
  /* you can call this signal handler from your SIGBUS and SIGSEGV
 signal handlers to inform the virtual CPU of exceptions. non zero
 is returned if the signal was handled by the virtual CPU.  */
 @@ -624,8 +660,13 @@ static inline bool cp_access_ok(CPUARMState *env,
  #define TARGET_PAGE_BITS 10
  #endif

 -#define TARGET_PHYS_ADDR_SPACE_BITS 40
 -#define TARGET_VIRT_ADDR_SPACE_BITS 32
 +#if defined (TARGET_ARM64)
 +#  define TARGET_PHYS_ADDR_SPACE_BITS 64
 +#  define TARGET_VIRT_ADDR_SPACE_BITS 64
 +#else
 +#  define TARGET_PHYS_ADDR_SPACE_BITS 40
 +#  define TARGET_VIRT_ADDR_SPACE_BITS 32
 +#endif

  static inline CPUARMState *cpu_init(const char *cpu_model)
  {
 @@ -699,25 +740,31 @@ static inline void cpu_clone_regs(CPUARMState *env, 
 target_ulong newsp)
  static inline void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
  target_ulong *cs_base, int *flags)
  {
 -int privmode;
 -*pc = env-regs[15];
 -*cs_base = 0;
 -*flags = (env-thumb  ARM_TBFLAG_THUMB_SHIFT)
 -| (env-vfp.vec_len  ARM_TBFLAG_VECLEN_SHIFT)
 -| (env-vfp.vec_stride  ARM_TBFLAG_VECSTRIDE_SHIFT)
 -| (env-condexec_bits  ARM_TBFLAG_CONDEXEC_SHIFT)
 -| (env-bswap_code  ARM_TBFLAG_BSWAP_CODE_SHIFT);
 -if (arm_feature(env, ARM_FEATURE_M)) {
 -privmode = !((env-v7m.exception == 0)  (env-v7m.control  1));
 +if (is_a64(env)) {
 +*pc = env-pc;
 +*flags = 0;
  } 

Re: [Qemu-devel] [PATCH] glib: Add compat wrapper for g_poll on old glib

2013-02-26 Thread Laurent Desnogues
On Tue, Feb 26, 2013 at 12:46 AM, Alexander Graf ag...@suse.de wrote:
 Older glib doesn't implement g_poll(). Most notably the glib version in use
 on SLE11 is on 2.18 which is hit by this.

 We do want to use g_poll() in the source however. So on older systems, just
 wrap it with functions that do exist on older versions.

That compiles fine on my old CentOS workstation.

Thanks,

Laurent

 Signed-off-by: Anthony Liguori aligu...@us.ibm.com
 Signed-off-by: Alexander Graf ag...@suse.de
 ---
  include/qemu-common.h |   12 
  1 files changed, 12 insertions(+), 0 deletions(-)

 diff --git a/include/qemu-common.h b/include/qemu-common.h
 index 80016ad..5e13708 100644
 --- a/include/qemu-common.h
 +++ b/include/qemu-common.h
 @@ -142,6 +142,18 @@ int qemu_main(int argc, char **argv, char **envp);
  void qemu_get_timedate(struct tm *tm, int offset);
  int qemu_timedate_diff(struct tm *tm);

 +#if !GLIB_CHECK_VERSION(2, 20, 0)
 +/*
 + * Glib before 2.20.0 doesn't implement g_poll, so wrap it to compile 
 properly
 + * on older systems.
 + */
 +static inline gint g_poll(GPollFD *fds, guint nfds, gint timeout)
 +{
 +GMainContext *ctx = g_main_context_default();
 +return g_main_context_get_poll_func(ctx)(fds, nfds, timeout);
 +}
 +#endif
 +
  /**
   * is_help_option:
   * @s: string to test
 --
 1.6.0.2




Re: [Qemu-devel] [PATCH] arm/translate.c: Fix adc_CC/sbc_CC implementation

2013-02-25 Thread Laurent Desnogues
On Mon, Feb 25, 2013 at 3:43 PM, Richard Henderson r...@twiddle.net wrote:
 On 2013-02-25 00:04, Peter Crosthwaite wrote:

 commits 49b4c31efcce45ab714f286f14fa5d5173f9069d and
 2de68a4900ef6eb67380b0c128abfe1976bc66e8 reworked the implementation of
 adc_CC
 and sub_CC. The new implementations (on the TCG_TARGET_HAS_add2_i32 code
 path)
 are incorrect. The new logic is:

 CF:NF = 0:A +/- 0:CF
 CF:NF = CF:A +/- 0:B

 The lower 32 bits of the intermediate result stored in NF needs to be
 passes
 into the second addition in place of A (s/CF:A/CF:NF):

 CF:NF = 0:A +/- 0:CF
 CF:NF = CF:NF +/- 0:B

 This patch fixes the issue.

 Signed-off-by: Peter Crosthwaitepeter.crosthwa...@xilinx.com
 ---
   target-arm/translate.c |4 ++--
   1 files changed, 2 insertions(+), 2 deletions(-)


 Reviewed-by: Richard Henderson r...@twiddle.net

 Sorry for the breakage.  Blue, please apply asap.

I'm afraid this fix is not enough as I still can't get my Linux image
to boot after applying it.

Running this, my image boots:
git checkout 49b4c31efcce45ab714f286f14fa5d5173f9069d target-arm

Looking at the new sbc_cc, it seems that if t0=t1 and CF=1,
then CF will be cleared while the old code in the helper did
set it.


Laurent

PS: My image is the vexpress found here:
http://releases.linaro.org/images/linaro-n/alip/11.08



Re: [Qemu-devel] [PATCH v2 0/3] Fix two carry problems in add2 patchset

2013-02-25 Thread Laurent Desnogues
On Mon, Feb 25, 2013 at 8:01 PM, Richard Henderson r...@twiddle.net wrote:
 V1-V2:
   Dangit, that's what trying to be speedy gets you, failing to make
   all of the changes you wrote in the comment.  This time actually
   fix the sub2 to add2 like I intended, not just the not portion.

 The patch set is re-pushed at

   git://github.com/rth7680/qemu.git fix-carry

 Please *review* asap.

My kernel is happier now :-)

Thanks,

Laurent


 r~


 Peter Crosthwaite (1):
   arm/translate.c: Fix adc_CC/sbc_CC implementation

 Richard Henderson (2):
   target-arm: Fix sbc_CC carry
   target-ppc: Fix SUBFE carry

  target-arm/translate.c | 10 +-
  target-ppc/translate.c |  9 +
  2 files changed, 10 insertions(+), 9 deletions(-)

 --
 1.8.1.2





Re: [Qemu-devel] [PATCH v4 00/10] main-loop: switch to g_poll(3) on POSIX hosts

2013-02-22 Thread Laurent Desnogues
On Wed, Feb 20, 2013 at 11:28 AM, Stefan Hajnoczi stefa...@redhat.com wrote:
 Amos Kong ak...@redhat.com reported that file descriptors numbered higher
 than 1024 could crash QEMU.  This is due to the fixed size of the fd_set type
 used for select(2) event polling.

 This series converts the main-loop.c and aio-posix.c select(2) calls to
 g_poll(3).  This eliminates the fd_set type and allows QEMU to scale to high
 numbers of file descriptors.

 The g_poll(3) interface is a portable version of the poll(2) system call.  The
 difference to select(2) is that fine-grained events (G_IO_IN, G_IO_OUT,
 G_IO_HUP, G_IO_ERR, G_IO_PRI) can be monitored instead of just
 read/write/exception.  Also, there is no limit to the file descriptor numbers
 that may be used, allowing applications to scale to many file descriptors.  
 See
 the documentation for details:

   http://developer.gnome.org/glib/2.28/glib-The-Main-Event-Loop.html#g-poll

g_poll isn't available in older glib versions used by CentOS (2.12
on CentOS 5.6).

Thanks,

Laurent

 The QEMU main loop works as follows today:

 1. Call out to slirp, iohandlers, and glib sources to fill rfds/wfds/xfds with
the file descriptors to select(2).
 2. Perform the select(2) call.
 3. Call out to slirp, iohandlers, and glib sources to handle events polled in
rfds/wfds/xfds.

 The plan of attack is as follows:

 1. Replace select(2) with g_poll(3).  Use glue that converts between
rfds/wfds/xfds and GPollFD so that the unconverted QEMU components still
work.

 2. Convert slirp, iohandlers, and glib source fill/poll functions to use
GPollFD directly instead of rfds/wfds/xfds.

 3. Drop the glue since all components now natively use GPollFD.

 4. Convert aio-posix.c to g_poll(3) by reusing GPollFD.

 I have tested that the series builds and is bisectable on Linux and Windows
 hosts.  But I have not done extensive testing on other host platforms or with
 long-term guests to check for performance regressions.

 v4:
  * Assign revents instead of bitwise OR to make code clearer [Laszlo]
  * Invoke pollfds_poll() only when select(2) succeeds [Laszlo]
  * Fix gpollfds_to_select(select_ret || g_poll_ret) call [Laszlo]

 v3:
  * Convert slirp/slirp.c to tabs to spaces [Blue Swirl]

 v2:
  * Replace custom Poller type with GArray [aliguori]

 Stefan Hajnoczi (10):
   main-loop: fix select_ret uninitialized variable warning
   main-loop: switch to g_poll() on POSIX hosts
   main-loop: switch POSIX glib integration to GPollFD
   slirp: slirp/slirp.c coding style cleanup
   slirp: switch to GPollFD
   iohandler: switch to GPollFD
   main-loop: drop rfds/wfds/xfds for good
   aio: extract aio_dispatch() from aio_poll()
   aio: convert aio_poll() to g_poll(3)
   aio: support G_IO_HUP and G_IO_ERR

  aio-posix.c  | 130 -
  async.c  |   2 +
  include/block/aio.h  |   3 +
  include/qemu/main-loop.h |   4 +-
  iohandler.c  |  40 ++-
  main-loop.c  | 158 ++-
  slirp/libslirp.h |   6 +-
  slirp/main.h |   1 -
  slirp/slirp.c| 673 
 +--
  slirp/socket.c   |   9 -
  slirp/socket.h   |   2 +
  stubs/slirp.c|   6 +-
  12 files changed, 547 insertions(+), 487 deletions(-)

 --
 1.8.1.2





Re: [Qemu-devel] GTK UI is now the default

2013-02-22 Thread Laurent Desnogues
On Fri, Feb 22, 2013 at 12:04 AM, Anthony Liguori aligu...@us.ibm.com wrote:

 Since this is a pretty visible change for a lot of people, I thought I'd
 send a top level note.  The GTK UI is now committed and is the default
 UI provided it's available.

 For anyone counting, it's been a little more than 7 years in the making:

 http://article.gmane.org/gmane.comp.emulators.qemu/9726

 If you want to try it, make sure you have the gtk-2.0 development
 packages installed and the VTE development packages.

It looks like gtk_widget_get_window isn't available in all gtk 2 versions,
at least it's not in the gtk2 2.10 on my CentOS 5.6 system.

Thanks,

Laurent



Re: [Qemu-devel] GTK UI is now the default

2013-02-22 Thread Laurent Desnogues
On Fri, Feb 22, 2013 at 11:36 AM, Daniel P. Berrange
berra...@redhat.com wrote:
 On Fri, Feb 22, 2013 at 11:12:35AM +0100, Laurent Desnogues wrote:
 On Fri, Feb 22, 2013 at 12:04 AM, Anthony Liguori aligu...@us.ibm.com 
 wrote:
 
  Since this is a pretty visible change for a lot of people, I thought I'd
  send a top level note.  The GTK UI is now committed and is the default
  UI provided it's available.
 
  For anyone counting, it's been a little more than 7 years in the making:
 
  http://article.gmane.org/gmane.comp.emulators.qemu/9726
 
  If you want to try it, make sure you have the gtk-2.0 development
  packages installed and the VTE development packages.

 It looks like gtk_widget_get_window isn't available in all gtk 2 versions,
 at least it's not in the gtk2 2.10 on my CentOS 5.6 system.

 The docs say since 2.14

 http://developer.gnome.org/gtk3/stable/GtkWidget.html#gtk-widget-get-window

 IIRC before that you could simply refernece  widget-window directly.
 So you could probably try adding something like this:

   #if ! GTK_CHECK_VERSION(2, 14, 0)
   #define gtk_widget_get_window(w) (w)-window
   #endif

 and see if that fixes your build.

That fixes the first error, but the others remain:

ui/gtk.c: In function 'gd_resize':
ui/gtk.c:254: warning: implicit declaration of function
'cairo_format_stride_for_width'
ui/gtk.c:254: warning: nested extern declaration of
'cairo_format_stride_for_width'
ui/gtk.c: In function 'gd_draw_event':
ui/gtk.c:355: warning: implicit declaration of function
'gtk_widget_get_realized'
ui/gtk.c:355: warning: nested extern declaration of 'gtk_widget_get_realized'
ui/gtk.c: In function 'gd_vc_init':
ui/gtk.c:864: error: 'VtePty' undeclared (first use in this function)
ui/gtk.c:864: error: (Each undeclared identifier is reported only once
ui/gtk.c:864: error: for each function it appears in.)
ui/gtk.c:864: error: 'pty' undeclared (first use in this function)
ui/gtk.c:885: error: 'GDK_KEY_2' undeclared (first use in this function)
ui/gtk.c:897: warning: implicit declaration of function 'vte_pty_new_foreign'
ui/gtk.c:897: warning: nested extern declaration of 'vte_pty_new_foreign'
ui/gtk.c:899: warning: implicit declaration of function
'vte_terminal_set_pty_object'
ui/gtk.c:899: warning: nested extern declaration of
'vte_terminal_set_pty_object'
ui/gtk.c: In function 'gd_create_menus':
ui/gtk.c:1009: error: 'GDK_KEY_f' undeclared (first use in this function)
ui/gtk.c:1018: error: 'GDK_KEY_plus' undeclared (first use in this function)
ui/gtk.c:1024: error: 'GDK_KEY_minus' undeclared (first use in this function)
ui/gtk.c:1030: error: 'GDK_KEY_0' undeclared (first use in this function)
ui/gtk.c:1045: error: 'GDK_KEY_g' undeclared (first use in this function)
ui/gtk.c:1055: error: 'GDK_KEY_1' undeclared (first use in this function)
ui/gtk.c: In function 'gtk_display_init':
ui/gtk.c:1109: error: 'GDK_BLANK_CURSOR' undeclared (first use in this function)
ui/gtk.c:1131: warning: implicit declaration of function
'gtk_widget_set_can_focus'
ui/gtk.c:1131: warning: nested extern declaration of 'gtk_widget_set_can_focus'

Thanks,

Laurent



Re: [Qemu-devel] [PATCH for-1.4] linux-user: Restore cast to target type in get_user()

2013-01-31 Thread Laurent Desnogues
On Thu, Jan 31, 2013 at 1:50 PM, Peter Maydell peter.mayd...@linaro.org wrote:
 Commit 658f2dc97 accidentally dropped the cast to the target type of
 the value loaded by get_user().  The most visible effect of this would
 be that the sequence uint64_t v; get_user_u32(v, addr) would sign
 extend the 32 bit loaded value into v rather than zero extending as
 would be expected for a _u32 accessor.  Put the cast back again to
 restore the old behaviour.

This fixes the issue I mentioned a week ago, thanks.

Reviewed-by: Laurent Desnogues laurent.desnog...@gmail.com


Laurent

 Signed-off-by: Peter Maydell peter.mayd...@linaro.org
 ---
  linux-user/qemu.h |4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

 diff --git a/linux-user/qemu.h b/linux-user/qemu.h
 index 31a220a..b10e957 100644
 --- a/linux-user/qemu.h
 +++ b/linux-user/qemu.h
 @@ -306,12 +306,12 @@ static inline int access_ok(int type, abi_ulong addr, 
 abi_ulong size)
   ((hptr), (x)), 0)

  #define __get_user_e(x, hptr, e)\
 -  ((x) =\
 +  ((x) = (typeof(*hptr))(   \
 __builtin_choose_expr(sizeof(*(hptr)) == 1, ldub_p,  \
 __builtin_choose_expr(sizeof(*(hptr)) == 2, lduw_##e##_p,\
 __builtin_choose_expr(sizeof(*(hptr)) == 4, ldl_##e##_p, \
 __builtin_choose_expr(sizeof(*(hptr)) == 8, ldq_##e##_p, abort   \
 - (hptr), 0)
 + (hptr)), 0)

  #ifdef TARGET_WORDS_BIGENDIAN
  # define __put_user(x, hptr)  __put_user_e(x, hptr, be)
 --
 1.7.9.5




Re: [Qemu-devel] [PATCH 00/57] target-i386 eflags cleanup and bmi/adx extensions

2013-01-26 Thread Laurent Desnogues
On Thu, Jan 24, 2013 at 9:37 PM, Richard Henderson r...@twiddle.net wrote:
 On 01/24/2013 08:57 AM, Laurent Desnogues wrote:

 On Thu, Jan 24, 2013 at 5:52 PM, Richard Henderson r...@twiddle.net
 wrote:

 On 2013-01-24 08:46, Laurent Desnogues wrote:


 I gave a quick try a your branch. My host is an x86_64 CPU and I
 ran an i386 nbench in user mode.  It works but some parts of the
 benchmark are noticeably slower (10%).  Is that expected?


 Nope.  Everything in there should be about speeding up...

 I'll have a look at it and see if there's something obvious.


 Let me know if you need more information or the binary (I compiled
 it some time ago with the oldest compiler I could find, gcc 2.96).


 Would you look and see how much variability you're getting?  I had a quick
 look with a (newly built) nbench binary and don't see any speed regressions
 outside the error bars.

 Built with gcc 4.7.2, 4 trials each:

 Master  Eflags3
 Avg Stddev  Avg Stddev
 Change Error
 Num S   585.92  18.25   573.79  4.39
 -2.07% 3.12%
 String S51.14   1.1051.52   0.13
 0.73% 2.15%
 Bitfield1.64E+008   4.04E+006   1.62E+008   8.63E+005
 -1.32% 2.46%
 FP Emu  85.65   1.81114.74  1.18
 33.97% 2.12%
 Fourier 1365.03 28.79   2813.78 11.72
 106.13% 2.11%
 Assign  14.86   0.2414.89   0.21
 0.22% 1.62%
 Idea723.70  43.31   884.20  4.55
 22.18% 5.98%
 Huff495.27  8.72702.89  3.53
 41.92% 1.76%
 N Net   0.290.010.730.00
 149.99% 1.78%
 LU Decomp   9.260.1621.91   0.22
 136.61% 1.70%


 I haven't looked to see where the massive fp improvements come from, but my
 first guess is not storing cc_op so often.  Although perhaps it would keep
 us on the same page if we were talking about the exact same binary...

Here are my results (5 runs):

  master eflags3eflags3
   stddev stddev/master
NUMERIC SO  488.59  3.82  507.37  3.871.04
STRING SOR   42.10  0.27   43.51  0.131.03
BITFIELD  105344000.00 885426.45 91472800.00 426835.680.87
FP EMULATI   22.70  0.54   23.07  0.381.02
FOURIER2669.56 16.12 2576.34 29.460.97
ASSIGNMENT8.40  0.117.68  0.020.91
IDEA   1535.88  7.73 1620.72  3.801.06
HUFFMAN 212.21  2.08  214.34  1.281.01
NEURAL NET0.75  0.000.76  0.001.01
LU DECOMPO   24.25  0.06   24.75  0.081.02

Host is an i7-920 running gcc 4.6.1 x86_64.
nbench was compiled with gcc 2.96 targetting 32-bit.  The binary
I have is old and has been stripped and I alas have no access to
gcc 2.96 anymore.  Note that using a more recent compiler doesn't
show that much difference.


Laurent



Re: [Qemu-devel] [PATCH 00/57] target-i386 eflags cleanup and bmi/adx extensions

2013-01-24 Thread Laurent Desnogues
On Thu, Jan 24, 2013 at 5:02 AM, Richard Henderson r...@twiddle.net wrote:
 This is a re-working of Paolo's eflags cleanup from October, which
 I consider a pre-requisite to implementing the ADX extension.  I've
 rearranged most of the patches in trivial ways, and some quite
 significantly.

 I've tested the result by running the FC17 installer in both i386
 and x86_64 mode, and the bmi/adx extensions with small user-land
 test cases.

 The patch series is at

   git://github.com/rth7680/qemu.git eflags3

I gave a quick try a your branch. My host is an x86_64 CPU and I
ran an i386 nbench in user mode.  It works but some parts of the
benchmark are noticeably slower (10%).  Is that expected?

Thanks,

Laurent

 Please review.


 r~


 Paolo Bonzini (19):
   test-i386: QEMU_PACKED is not defined here
   test-i386: make it compile with a recent gcc
   target-i386: use OT_* consistently
   target-i386: introduce gen_ext_tl
   target-i386: factor setting of s-cc_op handling for string functions
   target-i386: drop cc_op argument of gen_jcc1
   target-i386: move carry computation for inc/dec closer to
 gen_op_set_cc_op
   target-i386: move eflags computation closer to gen_op_set_cc_op
   target-i386: compute eflags outside rcl/rcr helper
   target-i386: clean up sahf
   target-i386: use gen_jcc1 to compile loopz
   target-i386: factor gen_op_set_cc_op/tcg_gen_discard_tl around
 computing flags
   target-i386: add helper functions to get other flags
   target-i386: change gen_setcc_slow_T0 to gen_setcc_slow
   target-i386: optimize setcc instructions
   target-i386: use CCPrepare to generate conditional jumps
   target-i386: cleanup temporary macros for CCPrepare
   target-i386: introduce gen_cmovcc1
   target-i386: kill cpu_T3

 Richard Henderson (38):
   target-i386: Name the cc_op enumeration
   target-i386: Introduce set_cc_op
   target-i386: Don't clobber s-cc_op in gen_update_cc_op
   target-i386: Use gen_update_cc_op everywhere
   target-i386: do not compute eflags multiple times consecutively
   target-i386: no need to flush out cc_op before gen_eob
   target-i386: Move CC discards to set_cc_op
   target-i386: do not call helper to compute ZF/SF
   target-i386: use inverted setcond when computing NS or NZ
   target-i386: convert gen_compute_eflags_c to TCG
   target-i386: optimize setbe
   target-i386: optimize setle
   target-i386: introduce CCPrepare
   target-i386: introduce gen_prepare_cc
   target-i386: inline gen_prepare_cc_slow
   target-i386: expand cmov via movcond
   target-i386: use gen_op for cmps/scas
   target-i386: introduce gen_jcc1_noeob
   target-i386: Update cc_op before TCG branches
   target-i386: optimize flags checking after sub using CC_SRC2
   target-i386: Use CC_SRC2 for ADC and SBB
   target-i386: Don't reference ENV through most of cc helpers
   target-i386: Make helper_cc_compute_all const
   target-i386: Tidy prefix parsing
   target-i386: Decode the VEX prefixes
   target-i386: Implement MOVBE
   target-i386: Implement ANDN
   target-i386: Implement BEXTR
   target-i386: Implement BLSR, BLSMSK, BLSI
   target-i386: Implement BZHI
   target-i386: Implement MULX
   target-i386: Implement PDEP, PEXT
   target-i386: Implement SHLX, SARX, SHRX
   target-i386: Implement RORX
   target-i386: Implement ADX extension
   target-i386: Use clz/ctz for bsf/bsr helpers
   target-i386: Simplify bsf/bsr flags computation
   target-i386: Implement tzcnt and fix lzcnt

  target-i386/cc_helper.c |  243 ++--
  target-i386/cc_helper_template.h|  268 ++---
  target-i386/cpu.c   |   18 +-
  target-i386/cpu.h   |   24 +-
  target-i386/helper.c|   11 +-
  target-i386/helper.h|   12 +-
  target-i386/int_helper.c|   69 +-
  target-i386/shift_helper_template.h |   12 +-
  target-i386/translate.c | 2195 
 +--
  tests/tcg/test-i386.c   |   10 +-
  10 files changed, 1581 insertions(+), 1281 deletions(-)

 --
 1.7.11.7





Re: [Qemu-devel] [PATCH 6/8] linux-user: Rewrite __get_user/__put_user with __builtin_choose_expr

2013-01-23 Thread Laurent Desnogues
Hi,

I know this was committed some time ago, but I have a
comment about this patch.

On Sat, Jan 5, 2013 at 1:39 AM, Richard Henderson r...@twiddle.net wrote:
 The previous formuation with multiple assignments to __typeof(*hptr) falls
 down when hptr is qualified const.  E.g. with const struct S *p, p-f is
 also qualified const.

 With this formulation, there's no assignment to any local variable.

 Signed-off-by: Richard Henderson r...@twiddle.net
 ---
  linux-user/qemu.h | 63 
 +--
  1 file changed, 33 insertions(+), 30 deletions(-)

 diff --git a/linux-user/qemu.h b/linux-user/qemu.h
 index 8a3538c..31a220a 100644
 --- a/linux-user/qemu.h
 +++ b/linux-user/qemu.h
 @@ -287,36 +287,39 @@ static inline int access_ok(int type, abi_ulong addr, 
 abi_ulong size)
  (type == VERIFY_READ) ? PAGE_READ : (PAGE_READ | 
 PAGE_WRITE)) == 0;
  }

 -/* NOTE __get_user and __put_user use host pointers and don't check access. 
 */
 -/* These are usually used to access struct data members once the
 - * struct has been locked - usually with lock_user_struct().
 - */
 -#define __put_user(x, hptr)\
 -({ __typeof(*hptr) pu_ = (x);\
 -switch(sizeof(*hptr)) {\
 -case 1: break;\
 -case 2: pu_ = tswap16(pu_); break; \
 -case 4: pu_ = tswap32(pu_); break; \
 -case 8: pu_ = tswap64(pu_); break; \
 -default: abort();\
 -}\
 -memcpy(hptr, pu_, sizeof(pu_)); \
 -0;\
 -})
 -
 -#define __get_user(x, hptr) \
 -({ __typeof(*hptr) gu_; \
 -memcpy(gu_, hptr, sizeof(gu_)); \
 -switch(sizeof(*hptr)) {\
 -case 1: break; \
 -case 2: gu_ = tswap16(gu_); break; \
 -case 4: gu_ = tswap32(gu_); break; \
 -case 8: gu_ = tswap64(gu_); break; \
 -default: abort();\
 -}\
 -(x) = gu_; \
 -0;\
 -})
 +/* NOTE __get_user and __put_user use host pointers and don't check access.
 +   These are usually used to access struct data members once the struct has
 +   been locked - usually with lock_user_struct.  */
 +
 +/* Tricky points:
 +   - Use __builtin_choose_expr to avoid type promotion from ?:,
 +   - Invalid sizes result in a compile time error stemming from
 + the fact that abort has no parameters.
 +   - It's easier to use the endian-specific unaligned load/store
 + functions than host-endian unaligned load/store plus tswapN.  */
 +
 +#define __put_user_e(x, hptr, e)\
 +  (__builtin_choose_expr(sizeof(*(hptr)) == 1, stb_p,   \
 +   __builtin_choose_expr(sizeof(*(hptr)) == 2, stw_##e##_p, \
 +   __builtin_choose_expr(sizeof(*(hptr)) == 4, stl_##e##_p, \
 +   __builtin_choose_expr(sizeof(*(hptr)) == 8, stq_##e##_p, abort   \
 + ((hptr), (x)), 0)
 +
 +#define __get_user_e(x, hptr, e)\
 +  ((x) =\
 +   __builtin_choose_expr(sizeof(*(hptr)) == 1, ldub_p,  \
 +   __builtin_choose_expr(sizeof(*(hptr)) == 2, lduw_##e##_p,\
 +   __builtin_choose_expr(sizeof(*(hptr)) == 4, ldl_##e##_p, \
 +   __builtin_choose_expr(sizeof(*(hptr)) == 8, ldq_##e##_p, abort   \
 + (hptr), 0)

For 8- and 16-bit quantities the load is explicitly unsigned
through the use of ldub and lduw.  But for 32-bit, ldl_[bl]e_p
return an int, so if x is a 64-bit variable sign-extension will
happen.  I'm not sure this is desirable, for instance when
using get_user_u32 which makes one think the result is an
unsigned 32-bit value.  Shouldn't ldul*_p functions be added
and used in __get_user_e?

Note I found this in private code, but wonder if some public
code isn't affected by this.

Thanks,

Laurent

 +
 +#ifdef TARGET_WORDS_BIGENDIAN
 +# define __put_user(x, hptr)  __put_user_e(x, hptr, be)
 +# define __get_user(x, hptr)  __get_user_e(x, hptr, be)
 +#else
 +# define __put_user(x, hptr)  __put_user_e(x, hptr, le)
 +# define __get_user(x, hptr)  __get_user_e(x, hptr, le)
 +#endif

  /* put_user()/get_user() take a guest address and check access */
  /* These are usually used to access an atomic data type, such as an int,
 --
 1.7.11.7





Re: [Qemu-devel] func call to safely shutdown VM and quit qemu?

2012-12-03 Thread Laurent Desnogues
On Mon, Dec 3, 2012 at 9:43 AM, Peter Cheung mcheun...@hotmail.com wrote:
 Hi
Is there a func call to safely shutdown VM and quit qemu?

I am using qemu_system_shutdown_request().  I don't know it that's
the best way of quitting, but it works for me.

HTH,

Laurent



Re: [Qemu-devel] [PATCH v2 1/3] tcg/arm: fix TLB access in qemu-ld/st ops

2012-10-31 Thread Laurent Desnogues
On Tue, Oct 30, 2012 at 1:18 AM,  y...@ohm.aurel32.net wrote:
 From: Aurelien Jarno aurel...@aurel32.net

 The TCG arm backend considers likely that the offset to the TLB
 entries does not exceed 12 bits for mem_index = 0. In practice this is
 not true for at least the MIPS target.

 The current patch fixes that by loading the bits 23-12 with a separate
 instruction, and using loads with address writeback, independently of
 the value of mem_idx. In total this allow a 24-bit offset, which is a
 lot more than needed.

 Cc: Andrzej Zaborowski balr...@gmail.com
 Cc: Peter Maydell peter.mayd...@linaro.org
 Cc: qemu-sta...@nongnu.org
 Signed-off-by: Aurelien Jarno aurel...@aurel32.net
 ---
  tcg/arm/tcg-target.c |   77 
 +++---
  1 file changed, 41 insertions(+), 36 deletions(-)

 diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
 index e790bf0..03b1576 100644
 --- a/tcg/arm/tcg-target.c
 +++ b/tcg/arm/tcg-target.c
 @@ -639,6 +639,22 @@ static inline void tcg_out_ld32_12(TCGContext *s, int 
 cond,
  (rn  16) | (rd  12) | ((-im)  0xfff));
  }

 +/* Offset pre-increment with base writeback.  */
 +static inline void tcg_out_ld32_12wb(TCGContext *s, int cond,
 + int rd, int rn, tcg_target_long im)
 +{
 +/* ldr with writeback and both register equals is UNPREDICTABLE */
 +assert(rd != rn);
 +
 +if (im = 0) {
 +tcg_out32(s, (cond  28) | 0x05b0 |
 +(rn  16) | (rd  12) | (im  0xfff));
 +} else {
 +tcg_out32(s, (cond  28) | 0x0530 |
 +(rn  16) | (rd  12) | ((-im)  0xfff));
 +}
 +}
 +
  static inline void tcg_out_st32_12(TCGContext *s, int cond,
  int rd, int rn, tcg_target_long im)
  {
 @@ -1071,7 +1087,7 @@ static inline void tcg_out_qemu_ld(TCGContext *s, const 
 TCGArg *args, int opc)
  {
  int addr_reg, data_reg, data_reg2, bswap;
  #ifdef CONFIG_SOFTMMU
 -int mem_index, s_bits;
 +int mem_index, s_bits, tlb_offset;
  TCGReg argreg;
  # if TARGET_LONG_BITS == 64
  int addr_reg2;
 @@ -,19 +1127,15 @@ static inline void tcg_out_qemu_ld(TCGContext *s, 
 const TCGArg *args, int opc)
  TCG_REG_R0, TCG_REG_R8, CPU_TLB_SIZE - 1);
  tcg_out_dat_reg(s, COND_AL, ARITH_ADD, TCG_REG_R0, TCG_AREG0,
  TCG_REG_R0, SHIFT_IMM_LSL(CPU_TLB_ENTRY_BITS));
 -/* In the
 - *  ldr r1 [r0, #(offsetof(CPUArchState, 
 tlb_table[mem_index][0].addr_read))]
 - * below, the offset is likely to exceed 12 bits if mem_index != 0 and
 - * not exceed otherwise, so use an
 - *  add r0, r0, #(mem_index * sizeof *CPUArchState.tlb_table)
 - * before.
 - */
 -if (mem_index)
 +/* We assume that the offset is contained within 24 bits.  */
 +tlb_offset = offsetof(CPUArchState, tlb_table[mem_index][0].addr_read);
 +assert(tlb_offset  ~0xff == 0);
 +if (tlb_offset  0xfff) {
  tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_R0, TCG_REG_R0,
 -(mem_index  (TLB_SHIFT  1)) |
 -((16 - (TLB_SHIFT  1))  8));
 -tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R0,
 -offsetof(CPUArchState, tlb_table[0][0].addr_read));
 +0xa00 | (tlb_offset  12));

Isn't it 20 bits rather than 24 bits since the immediate is 8-bit right-rotated
by 20?


Laurent

 +tlb_offset = 0xfff;
 +}
 +tcg_out_ld32_12wb(s, COND_AL, TCG_REG_R1, TCG_REG_R0, tlb_offset);
  tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R1,
  TCG_REG_R8, SHIFT_IMM_LSL(TARGET_PAGE_BITS));
  /* Check alignment.  */
 @@ -1131,15 +1143,14 @@ static inline void tcg_out_qemu_ld(TCGContext *s, 
 const TCGArg *args, int opc)
  tcg_out_dat_imm(s, COND_EQ, ARITH_TST,
  0, addr_reg, (1  s_bits) - 1);
  #  if TARGET_LONG_BITS == 64
 -/* XXX: possibly we could use a block data load or writeback in
 - * the first access.  */
 -tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R0,
 -offsetof(CPUArchState, tlb_table[0][0].addr_read) + 4);
 +/* XXX: possibly we could use a block data load in the first access.  */
 +tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R0, 4);
  tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0,
  TCG_REG_R1, addr_reg2, SHIFT_IMM_LSL(0));
  #  endif
  tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R0,
 -offsetof(CPUArchState, tlb_table[0][0].addend));
 +offsetof(CPUTLBEntry, addend)
 +- offsetof(CPUTLBEntry, addr_read));

  switch (opc) {
  case 0:
 @@ -1288,7 +1299,7 @@ static inline void tcg_out_qemu_st(TCGContext *s, const 
 TCGArg *args, int opc)
  {
  int addr_reg, data_reg, data_reg2, bswap;
  #ifdef CONFIG_SOFTMMU
 -int mem_index, s_bits;
 +int mem_index, s_bits, 

Re: [Qemu-devel] Singlestepping Target assembly instructions

2012-10-16 Thread Laurent Desnogues
On Tue, Oct 16, 2012 at 9:07 PM, Richard Henderson r...@twiddle.net wrote:
 On 2012-10-16 20:49, Emmanuel Blot wrote:
 Is there any way to force QEmu to disable the TB cache so that the
 translation occurs each time a target instruction is loaded, or a
 clever way to print out the address of each executed target instruction ?

 -d exec prints the entry point of each TB as it is executed.  You can
 refer to the previously disassembled insns by reference.

If I remember correctly, that only works if block chaining is disabled.


Laurent



Re: [Qemu-devel] [PATCH 1/5] tcg/arm: fix TLB access in qemu-ld/st ops

2012-10-10 Thread Laurent Desnogues
On Tue, Oct 9, 2012 at 11:13 PM, Peter Maydell peter.mayd...@linaro.org wrote:
 On 9 October 2012 21:30, Aurelien Jarno aurel...@aurel32.net wrote:
 The TCG arm backend considers likely that the offset to the TLB
 entries does not exceed 12 bits for mem_index = 0. In practice this is
 not true for at list the MIPS target.

 The current patch fixes that by loading the bits 23-12 with a separate
 instruction, and using loads with address writeback, independently of
 the value of mem_idx. In total this allow a 24-bit offset, which is a
 lot more than needed.

 Would probably be good to assert() that the bits above 24 are zero,
 though.

 Cc: Andrzej Zaborowski balr...@gmail.com
 Cc: Peter Maydell peter.mayd...@linaro.org
 Cc: qemu-sta...@nongnu.org
 Signed-off-by: Aurelien Jarno aurel...@aurel32.net
 ---
  tcg/arm/tcg-target.c |   73 
 +-
  1 file changed, 37 insertions(+), 36 deletions(-)

 diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
 index 737200e..6cde512 100644
 --- a/tcg/arm/tcg-target.c
 +++ b/tcg/arm/tcg-target.c
 @@ -624,6 +624,19 @@ static inline void tcg_out_ld32_12(TCGContext *s, int 
 cond,
  (rn  16) | (rd  12) | ((-im)  0xfff));
  }

 +/* Offset pre-increment with base writeback.  */
 +static inline void tcg_out_ld32_12wb(TCGContext *s, int cond,
 + int rd, int rn, tcg_target_long im)
 +{

 Worth asserting that rd != rn ? (that's an UNPREDICTABLE case
 for ldr with writeback)

 +if (im = 0) {
 +tcg_out32(s, (cond  28) | 0x05b0 |
 +   (rn  16) | (rd  12) | (im  0xfff));
 +} else {
 +tcg_out32(s, (cond  28) | 0x0530 |
 +   (rn  16) | (rd  12) | ((-im)  0xfff));
 +}

 you could avoid the if() by just setting the U bit using (im = 0)  23
 Clearer? Dunno.

You also have to negate the im value so it's not enough to just
change the opcode.


Laurent

 Looks OK otherwise (though I haven't tested it.)

 Random aside: we emit a pretty long string of COND_EQ predicated
 code in these functions, especially in the 64 bit address and
 byteswapped cases. It might be interesting to see if performance
 on an A9, say, was any better if we just did a conditional branch
 over it instead... Probably borderline to no-effect, though.

 -- PMM




Re: [Qemu-devel] [PATCH 5/5] tcg/arm: improve direct jump

2012-10-10 Thread Laurent Desnogues
On Tue, Oct 9, 2012 at 10:30 PM, Aurelien Jarno aurel...@aurel32.net wrote:
 Use ldr pc, [pc, #-4] kind of branch for direct jump. This removes the
 need to flush the icache on TB linking, and allow to remove the limit
 on the code generation buffer.

I'm not sure I like it.  In general having data in the middle
of code will increase I/D cache and I/D TLB pressure.

 This improves the boot-up speed of a MIPS guest by 11%.

Boot speed is very specific.  Did you test some other code?
Also what was your host?

Testing on a quad core Cortex-A9, using all of your patches
(including TCG optimizations), I get this running nbench i386
in user mode:

TEST: Iter/sec.  : Old Index  : New Index
:: Pentium 90 : AMD K6/233
:::---
NUMERIC SORT: 119.48 :   3.06 :   1.01
STRING SORT : 7.7907 :   3.48 :   0.54
BITFIELD: 2.2049e+07 :   3.78 :   0.79
FP EMULATION:  5.094 :   2.44 :   0.56
FOURIER : 483.73 :   0.55 :   0.31
ASSIGNMENT  :  1.778 :   6.77 :   1.75
IDEA: 341.43 :   5.22 :   1.55
HUFFMAN : 45.942 :   1.27 :   0.41
NEURAL NET  :0.16667 :   0.27 :   0.11
LU DECOMPOSITION:  5.969 :   0.31 :   0.22
===ORIGINAL BYTEMARK RESULTS==
INTEGER INDEX   : 3.319
FLOATING-POINT INDEX: 0.357
===LINUX DATA BELOW===
MEMORY INDEX: 0.907
INTEGER INDEX   : 0.774
FLOATING-POINT INDEX: 0.198

Not using this patch, I get:

TEST: Iter/sec.  : Old Index  : New Index
:: Pentium 90 : AMD K6/233
:::---
NUMERIC SORT: 121.88 :   3.13 :   1.03
STRING SORT : 7.8438 :   3.50 :   0.54
BITFIELD: 2.2597e+07 :   3.88 :   0.81
FP EMULATION: 5.1424 :   2.47 :   0.57
FOURIER : 466.04 :   0.53 :   0.30
ASSIGNMENT  :  1.809 :   6.88 :   1.79
IDEA: 359.28 :   5.50 :   1.63
HUFFMAN : 46.225 :   1.28 :   0.41
NEURAL NET  :0.16644 :   0.27 :   0.11
LU DECOMPOSITION:   5.77 :   0.30 :   0.22
===ORIGINAL BYTEMARK RESULTS==
INTEGER INDEX   : 3.384
FLOATING-POINT INDEX: 0.349
===LINUX DATA BELOW===
MEMORY INDEX: 0.922
INTEGER INDEX   : 0.790
FLOATING-POINT INDEX: 0.193

This patch doesn't bring any speedup in that case.

I guess we need more testing as a synthetic benchmark is as
specific as kernel booting :-)


Laurent

 Cc: Andrzej Zaborowski balr...@gmail.com
 Cc: Peter Maydell peter.mayd...@linaro.org
 Signed-off-by: Aurelien Jarno aurel...@aurel32.net
 ---
  exec-all.h   |   24 
  exec.c   |4 
  tcg/arm/tcg-target.c |7 +--
  3 files changed, 5 insertions(+), 30 deletions(-)

 diff --git a/exec-all.h b/exec-all.h
 index 6516da0..662b916 100644
 --- a/exec-all.h
 +++ b/exec-all.h
 @@ -224,26 +224,10 @@ static inline void tb_set_jmp_target1(uintptr_t 
 jmp_addr, uintptr_t addr)
  #elif defined(__arm__)
  static inline void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr)
  {
 -#if !QEMU_GNUC_PREREQ(4, 1)
 -register unsigned long _beg __asm (a1);
 -register unsigned long _end __asm (a2);
 -register unsigned long _flg __asm (a3);
 -#endif
 -
 -/* we could use a ldr pc, [pc, #-4] kind of branch and avoid the flush */
 -*(uint32_t *)jmp_addr =
 -(*(uint32_t *)jmp_addr  ~0xff)
 -| (((addr - (jmp_addr + 8))  2)  0xff);
 -
 -#if QEMU_GNUC_PREREQ(4, 1)
 -__builtin___clear_cache((char *) jmp_addr, (char *) jmp_addr + 4);
 -#else
 -/* flush icache */
 -_beg = jmp_addr;
 -_end = jmp_addr + 4;
 -_flg = 0;
 -__asm __volatile__ (swi 0x9f0002 : : r (_beg), r (_end), r 
 (_flg));
 -#endif
 +/* Patch the branch destination. It uses a ldr pc, [pc, #-4] kind
 +   of branch so we write absolute address and we don't need to
 +   flush icache. */
 +*(uint32_t *)jmp_addr = addr;
  }
  #elif defined(__sparc__)
  void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr);
 diff --git a/exec.c b/exec.c
 index 7899042..8d115ac 100644
 --- a/exec.c
 +++ b/exec.c
 @@ -546,10 +546,6 @@ static void code_gen_alloc(unsigned long tb_size)
  start = (void *) 0x4000UL;
  if (code_gen_buffer_size  (512 * 1024 * 1024))
  code_gen_buffer_size = (512 * 1024 * 1024);
 -#elif defined(__arm__)
 -/* Keep the buffer no bigger than 16MB to branch between blocks */
 -if (code_gen_buffer_size  16 * 1024 * 1024)
 -

Re: [Qemu-devel] [PATCH 5/5] tcg/arm: improve direct jump

2012-10-10 Thread Laurent Desnogues
On Wed, Oct 10, 2012 at 4:28 PM, Aurelien Jarno aurel...@aurel32.net wrote:
 On Wed, Oct 10, 2012 at 03:21:48PM +0200, Laurent Desnogues wrote:
 On Tue, Oct 9, 2012 at 10:30 PM, Aurelien Jarno aurel...@aurel32.net wrote:
  Use ldr pc, [pc, #-4] kind of branch for direct jump. This removes the
  need to flush the icache on TB linking, and allow to remove the limit
  on the code generation buffer.

 I'm not sure I like it.  In general having data in the middle
 of code will increase I/D cache and I/D TLB pressure.

 Agreed. On the other hand, this patch remove the synchronization of
 the instruction cache for TB linking/unlinking.

TB linking/unlinking should happen less often than code execution.

  This improves the boot-up speed of a MIPS guest by 11%.

 Boot speed is very specific.  Did you test some other code?
 Also what was your host?

 I tested it on a Cortex-A8 machine. I have only tested MIPS, but I can
 do more tests, like running the openssl testsuite in the emulated guest.

Yes, please.

[...]
 This doesn't really surprise me. The goal of the patch is to remove the
 limit of 16MB for the generated code. I really doubt you reach such a
 limit in user mode unless you use some complex code.

 On the other hand in system mode, this can be already reached once the
 whole guest kernel is translated, so cached code is dropped and has to
 be re-translated regularly. Re-translating guest code is clearly more
 expensive than the increase of I/D cache and I/D TLB pressure.

Ha yes, that's a real problem.  What about having some define
and/or runtime flag to keep both caches sync and your ldr PC
change in QEMU?

 The other way to allow more than 16MB of generated code would be to
 disable direct jump on ARM. It adds one 32-bit constant loading + one
 memory load, but then you don't have the I/D cache and TLB issue.

The performance hit would be even worse :-)


Laurent



Re: [Qemu-devel] [PATCH 0/2] make guest-base support mandatory for TCG backends

2012-10-09 Thread Laurent Desnogues
On Tue, Oct 9, 2012 at 3:28 PM, Richard Henderson r...@twiddle.net wrote:
 On 10/09/2012 06:16 AM, Peter Maydell wrote:
 That is, we could drop CONFIG_USER_GUEST_BASE. Does anybody have
 a practical use case for the --disable-guest-base configuration?

 Nope, because the backends are good at eliminating the overhead
 during tcg_target_qemu_prologue if guest_base turns out to be zero.

I had some difficulties getting the same code generated with
--enable-guest-base as the one generated with --disable-guest-base,
for ARM on x86_64.  One has to play with -B 0 and -R, and that was
not obvious, while --disable-guest-base was obvious for me :-).

Guest ARM, host x86_64.

$ ./arm-linux-user/qemu-arm -singlestep -d in_asm,out_asm
/work/qemu/Tests/arm/gcc-a9-O3 x

IN:
0x8138:  e59fc010  ldr  ip, [pc, #16]   ; 0x8150

OUT: [size=53]
0x602296c0:  mov$0x8150,%ebp
0x602296c5:  mov$0x2ac1f9b61000,%rdi
0x602296cf:  add%rbp,%rdi
0x602296d2:  mov(%rdi),%ebp
0x602296d4:  mov%ebp,0x30(%r14)
0x602296d8:  jmpq   0x602296dd
0x602296dd:  mov$0x813c,%ebp
0x602296e2:  mov%ebp,0x3c(%r14)
0x602296e6:  mov$0x2ac1f6760281,%rax
0x602296f0:  jmpq   0x6226d6b6

Running with -B 0 and playing with -R values until the program
succeeds in loading finally produces the same code as when
compiling QEMU with --disable-guest-base

OUT: [size=41]
0x602281a0:  mov$0x8150,%ebp
0x602281a5:  mov0x0(%rbp),%ebp
0x602281a8:  mov%ebp,0x30(%r14)
0x602281ac:  jmpq   0x602281b1
0x602281b1:  mov$0x813c,%ebp
0x602281b6:  mov%ebp,0x3c(%r14)
0x602281ba:  mov$0x2b9b26096281,%rax
0x602281c4:  jmpq   0x6226c1b6

I know the reason why reserved_va was set to the current value
(cf 288e65b9eea0c9b3cbe21be46f3e24e4e8b2a090), but I
thought it was interesting to point out the above.


Laurent



Re: [Qemu-devel] [PATCH] configure: Allow builds without any system or user emulation

2012-09-24 Thread Laurent Desnogues
On Sat, Sep 22, 2012 at 11:37 PM, Stefan Weil s...@weilnetz.de wrote:
 Am 14.09.2012 19:02, schrieb Stefan Weil:

 The old code aborted configure when no emulation target was selected.
 Even after removing the 'exit 1', it tried to read from STDIN
 when QEMU was configured with

  configure' '--disable-user' '--disable-system'

 This is fixed here.

 Signed-off-by: Stefan Weils...@weilnetz.de
 ---

 This patch can be applied after 66d5499b3 was reverted.

 It also works on top of 66d5499b3. In this case only Makefile
 needs modifications, and the configure part of the patch must be removed.

 Regards

 Stefan Weil


   Makefile  |5 +
   configure |4 
   2 files changed, 5 insertions(+), 4 deletions(-)

 diff --git a/Makefile b/Makefile
 index 9523e05..d38ac0f 100644
 --- a/Makefile
 +++ b/Makefile
 @@ -52,8 +52,13 @@ SUBDIR_MAKEFLAGS=$(if $(V),,--no-print-directory)
 BUILD_DIR=$(BUILD_DIR)
   SUBDIR_DEVICES_MAK=$(patsubst %, %/config-devices.mak, $(TARGET_DIRS))
   SUBDIR_DEVICES_MAK_DEP=$(patsubst %, %/config-devices.mak.d,
 $(TARGET_DIRS))

 +ifeq ($(SUBDIR_DEVICES_MAK),)
 +config-all-devices.mak:
 +   $(call quiet-command,echo '# no devices'  $@,  GEN   $@)
 +else
   config-all-devices.mak: $(SUBDIR_DEVICES_MAK)
 $(call quiet-command,cat $(SUBDIR_DEVICES_MAK) | grep =y | sort
 -u  $@,  GEN   $@)
 +endif

   -include $(SUBDIR_DEVICES_MAK_DEP)

 diff --git a/configure b/configure
 index fc27bd9..a9305f3 100755
 --- a/configure
 +++ b/configure
 @@ -1331,10 +1331,6 @@ if test -z $target_list ; then
   else
   target_list=`echo $target_list | sed -e 's/,/ /g'`
   fi
 -if test -z $target_list ; then
 -echo No targets enabled
 -exit 1
 -fi
   # see if system emulation was really requested
   case  $target_list  in
 *-softmmu *) softmmu=yes




 Ping? 66d5499b3 was reverted, so my patch can be applied if nobody objects.

Works fine here.

I think Daniel's original proposal to use some keyword for target-list
is more convenient than having to use both --disable-user and
--disable-system, but that's no big deal :-)


Laurent



Re: [Qemu-devel] [PATCH 0/9] tcg/optimize: rework copy propagation

2012-09-21 Thread Laurent Desnogues
On Wed, Sep 19, 2012 at 10:00 PM, Aurelien Jarno aurel...@aurel32.net wrote:
 This patch series rework the copy propagation in order to generate
 better code. The first two patches are cleanup and bug fixes, the third
 patch is the heart of the series, and the remaining ones are small
 optimizations using the new copy propagation.

 I have measured a decrease of the generated code size of about 4%, with
 a gain in speed between 0 and 2% depending on the workload.

 For better benefits in ARM emulation, it should be used with the setcond
 patches series I have sent a few days ago.

Using this patch series along with the ARM one, I got a nice gain of
speed in user mode for SPECCPU 2k gcc:  without the patches the
integrate.i data set runs in 13.6s, with them the time is reduced to 12.4s
(host CPU is i5 2400 64-bit mode). Nice job!


Laurent

 Aurelien Jarno (9):
   tcg/optimizer: remove TCG_TEMP_ANY
   tcg/optimizer: check types in copy propagation
   tcg/optimizer: rework copy progagation
   tcg/optimize: do copy propagation for all operations
   tcg/optimize: optimize op r, a, a = mov r, a
   tcg/optimize: optimize op r, a, a = movi r, 0
   tcg/optimize: further optimize brcond/setcond
   tcg/optimize: prefer the op a, a, b form for commutative ops
   tcg: remove #ifdef #endif around TCGOpcode tests

  tcg/optimize.c |  326 
 ++--
  tcg/tcg.c  |   16 +--
  2 files changed, 200 insertions(+), 142 deletions(-)

 --
 1.7.10.4





Re: [Qemu-devel] [PATCH 3/5] target-arm: convert shl and shr helpers to TCG

2012-09-17 Thread Laurent Desnogues
On Mon, Sep 17, 2012 at 1:08 AM, Aurelien Jarno aurel...@aurel32.net wrote:
 Now that the setcond TCG op is available, it's possible to replace
 shl and shr helpers by TCG code. The code generated by TCG is slightly
 longer than the code generated by GCC for the helper but is still worth
 it as this avoid all the consequences of using an helper: globals saved
 back to memory, no possible optimization, call overhead, etc.

 Cc: Peter Maydell peter.mayd...@linaro.org
 Signed-off-by: Aurelien Jarno aurel...@aurel32.net
 ---
  target-arm/helper.h|2 --
  target-arm/op_helper.c |   16 
  target-arm/translate.c |   32 
  3 files changed, 28 insertions(+), 22 deletions(-)

 diff --git a/target-arm/helper.h b/target-arm/helper.h
 index 7151e28..b123d3e 100644
 --- a/target-arm/helper.h
 +++ b/target-arm/helper.h
 @@ -145,8 +145,6 @@ DEF_HELPER_5(neon_tbl, i32, env, i32, i32, i32, i32)
  DEF_HELPER_3(adc_cc, i32, env, i32, i32)
  DEF_HELPER_3(sbc_cc, i32, env, i32, i32)

 -DEF_HELPER_3(shl, i32, env, i32, i32)
 -DEF_HELPER_3(shr, i32, env, i32, i32)
  DEF_HELPER_3(sar, i32, env, i32, i32)
  DEF_HELPER_3(shl_cc, i32, env, i32, i32)
  DEF_HELPER_3(shr_cc, i32, env, i32, i32)
 diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c
 index 6095f24..5fcd12c 100644
 --- a/target-arm/op_helper.c
 +++ b/target-arm/op_helper.c
 @@ -355,22 +355,6 @@ uint32_t HELPER(sbc_cc)(CPUARMState *env, uint32_t a, 
 uint32_t b)

  /* Similarly for variable shift instructions.  */

 -uint32_t HELPER(shl)(CPUARMState *env, uint32_t x, uint32_t i)
 -{
 -int shift = i  0xff;
 -if (shift = 32)
 -return 0;
 -return x  shift;
 -}
 -
 -uint32_t HELPER(shr)(CPUARMState *env, uint32_t x, uint32_t i)
 -{
 -int shift = i  0xff;
 -if (shift = 32)
 -return 0;
 -return (uint32_t)x  shift;
 -}
 -
  uint32_t HELPER(sar)(CPUARMState *env, uint32_t x, uint32_t i)
  {
  int shift = i  0xff;
 diff --git a/target-arm/translate.c b/target-arm/translate.c
 index 19bb1e8..9c29065 100644
 --- a/target-arm/translate.c
 +++ b/target-arm/translate.c
 @@ -440,6 +440,26 @@ static void gen_sub_CC(TCGv dest, TCGv t0, TCGv t1)
  tcg_gen_mov_i32(dest, cpu_NF);
  }

 +#define GEN_SHIFT(name)\
 +static void gen_##name(TCGv dest, TCGv t0, TCGv t1)\
 +{  \
 +TCGv tmp1, tmp2;   \
 +tmp1 = tcg_temp_new_i32(); \
 +tcg_gen_andi_i32(tmp1, t1, 0xff);  \
 +tmp2 = tcg_temp_new_i32(); \
 +tcg_gen_setcondi_i32(TCG_COND_GE, tmp2, tmp1, 32); \
 +tcg_gen_andi_i32(tmp1, tmp1, 0x1f);\

I don't think the 'and 0x1f' is needed given that later you'll and
with 0 if the shift amount is = 32.

 +tcg_gen_##name##_i32(dest, t0, tmp1);  \
 +tcg_temp_free_i32(tmp1);   \
 +tcg_gen_subi_i32(tmp2, tmp2, 1);   \
 +tcg_gen_and_i32(dest, dest, tmp2); \
 +tcg_temp_free_i32(tmp2);   \
 +}


Laurent

 +GEN_SHIFT(shl)
 +GEN_SHIFT(shr)
 +#undef GEN_SHIFT
 +
 +
  /* FIXME:  Implement this natively.  */
  #define tcg_gen_abs_i32(t0, t1) gen_helper_abs(t0, t1)

 @@ -516,8 +536,12 @@ static inline void gen_arm_shift_reg(TCGv var, int 
 shiftop,
  }
  } else {
  switch (shiftop) {
 -case 0: gen_helper_shl(var, cpu_env, var, shift); break;
 -case 1: gen_helper_shr(var, cpu_env, var, shift); break;
 +case 0:
 +gen_shl(var, var, shift);
 +break;
 +case 1:
 +gen_shr(var, var, shift);
 +break;
  case 2: gen_helper_sar(var, cpu_env, var, shift); break;
  case 3: tcg_gen_andi_i32(shift, shift, 0x1f);
  tcg_gen_rotr_i32(var, var, shift); break;
 @@ -9161,7 +9185,7 @@ static void disas_thumb_insn(CPUARMState *env, 
 DisasContext *s)
  break;
  case 0x2: /* lsl */
  if (s-condexec_mask) {
 -gen_helper_shl(tmp2, cpu_env, tmp2, tmp);
 +gen_shl(tmp2, tmp2, tmp);
  } else {
  gen_helper_shl_cc(tmp2, cpu_env, tmp2, tmp);
  gen_logic_CC(tmp2);
 @@ -9169,7 +9193,7 @@ static void disas_thumb_insn(CPUARMState *env, 
 DisasContext *s)
  break;
  case 0x3: /* lsr */
  if (s-condexec_mask) {
 -gen_helper_shr(tmp2, cpu_env, tmp2, tmp);
 +gen_shr(tmp2, tmp2, tmp);
  } else {
  gen_helper_shr_cc(tmp2, cpu_env, tmp2, tmp);
  gen_logic_CC(tmp2);
 --
 1.7.10.4





Re: [Qemu-devel] [PATCH 3/5] target-arm: convert shl and shr helpers to TCG

2012-09-17 Thread Laurent Desnogues
On Mon, Sep 17, 2012 at 11:43 AM, Peter Maydell
peter.mayd...@linaro.org wrote:
 On 17 September 2012 10:30, Laurent Desnogues
 laurent.desnog...@gmail.com wrote:
 On Mon, Sep 17, 2012 at 1:08 AM, Aurelien Jarno aurel...@aurel32.net wrote:
 +#define GEN_SHIFT(name)\
 +static void gen_##name(TCGv dest, TCGv t0, TCGv t1)\
 +{  \
 +TCGv tmp1, tmp2;   \
 +tmp1 = tcg_temp_new_i32(); \
 +tcg_gen_andi_i32(tmp1, t1, 0xff);  \
 +tmp2 = tcg_temp_new_i32(); \
 +tcg_gen_setcondi_i32(TCG_COND_GE, tmp2, tmp1, 32); \
 +tcg_gen_andi_i32(tmp1, tmp1, 0x1f);\

 I don't think the 'and 0x1f' is needed given that later you'll and
 with 0 if the shift amount is = 32.

 The TCG shift operations are undefined behaviour (not merely
 undefined result) if the shift is = 32, so we must avoid
 doing that even if we're going to throw away the answer.

That's odd that it doesn't just state that the result is undefined.
I wonder what undefined behavior means.  I understand
what undefined behavior (as opposed tu undefined result)
means for divisions by 0, but not for a shift larger than data
type width.

Anyway that makes my comment about removing the  0x1f
pointless.


Laurent



Re: [Qemu-devel] [PATCH v2] Add ability to disable build of all targets

2012-09-12 Thread Laurent Desnogues
On Tue, Sep 11, 2012 at 8:56 PM, Eduardo Habkost ehabk...@redhat.com wrote:
 On Mon, Sep 10, 2012 at 06:00:54PM -0500, Anthony Liguori wrote:
 Daniel P. Berrange berra...@redhat.com writes:

  From: Daniel P. Berrange berra...@redhat.com
 
  Allow passing of '--target-list=' to configure to request that
  all targets are to be disabled. This allows for doing a very
  fast tools-only build of things like qemu-img, qemu-io, qemu-nbd.
 
  Signed-off-by: Daniel P. Berrange berra...@redhat.com

 Applied. Thanks.

 This patch broke the --target-list option:

   $ ./configure --target-list=x86_64-linux-user,x86_64-softmmu
   [...]
   ERROR: Target 'x86_64-linux-user,x86_64-softmmu' not recognised
   $

It also seems to not even allow to use --target-list=.  On my setup
(CentOS 5.6, building in the git directory) the build process just
hangs here:

$ make V=1
cat  | grep =y | sort -u  config-all-devices.mak


Laurent


 Regards,

 Anthony Liguori

  ---
   configure | 13 -
   1 file changed, 4 insertions(+), 9 deletions(-)
 
  diff --git a/configure b/configure
  index 75dc9da..472374e 100755
  --- a/configure
  +++ b/configure
  @@ -127,7 +127,7 @@ cc_i386=i386-pc-linux-gnu-gcc
   libs_qga=
   debug_info=yes
 
  -target_list=
  +target_list=DEFAULT
 
   # Default value for a variable defining feature foo.
   #  * foo=no  feature will only be used if --enable-foo arg is given
  @@ -1319,15 +1319,10 @@ if ! $python -c 'import sys; 
  sys.exit(sys.version_info  (2,4) or sys.version_
 exit 1
   fi
 
  -if test -z $target_list ; then
  -target_list=$default_target_list
  -else
  -target_list=`echo $target_list | sed -e 's/,/ /g'`
  -fi
  -if test -z $target_list ; then
  -echo No targets enabled
  -exit 1
  +if test $target_list = DEFAULT ; then
  +target_list=`echo $default_target_list | sed -e 's/,/ /g'`
   fi
  +
   # see if system emulation was really requested
   case  $target_list  in
 *-softmmu *) softmmu=yes
  --
  1.7.11.2


 --
 Eduardo




Re: [Qemu-devel] [PATCH] configure: fix --target-list=target, target, ... option

2012-09-12 Thread Laurent Desnogues
Sorry, I had missed this patch...

On Tue, Sep 11, 2012 at 9:02 PM, Eduardo Habkost ehabk...@redhat.com wrote:
 commit 66d5499b3754b83c09487259c08fe2ce73188a59 broke the support for
 comma-separated target lists on the --target-list option. e.g.:

   $ ./configure --target-list=x86_64-linux-user,x86_64-softmmu
   [...]
   ERROR: Target 'x86_64-linux-user,x86_64-softmmu' not recognised
   $

 This patch restores that ability.

 Signed-off-by: Eduardo Habkost ehabk...@redhat.com
 Cc: Daniel P. Berrange berra...@redhat.com
 Cc: Anthony Liguori anth...@codemonkey.ws
 ---
  configure | 4 +++-
  1 file changed, 3 insertions(+), 1 deletion(-)

 diff --git a/configure b/configure
 index 7656c32..9ee7038 100755
 --- a/configure
 +++ b/configure
 @@ -1323,7 +1323,9 @@ if ! $python -c 'import sys; 
 sys.exit(sys.version_info  (2,4) or sys.version_
  fi

  if test $target_list = DEFAULT ; then
 -target_list=`echo $default_target_list | sed -e 's/,/ /g'`
 +target_list=$default_target_list
 +else
 +target_list=`echo $target_list | sed -e 's/,/ /g'`
  fi

This works for me too.

But I still can't get what the original patch posted by
Daniel Berrange intended to do:

$ ./configure --target-list=
$ make V=1
cat  | grep =y | sort -u  config-all-devices.mak

And it of course hangs there.

Creating an empty config-all-devices.mak before running
make solves the issue.


Laurent

  # see if system emulation was really requested
 --
 1.7.11.4





Re: [Qemu-devel] How does ARM VFP is emulated?

2012-08-17 Thread Laurent Desnogues
On Thursday, August 16, 2012, Oi Khote oikh...@hotmail.com wrote:
 So how exactly does VFP is being emulated.

QEMU uses a library for FP computations, based on the softfloat package.


Laurent


Re: [Qemu-devel] qemu log function to print out the registers of the guest

2012-08-16 Thread Laurent Desnogues
On Thu, Aug 16, 2012 at 7:49 PM, Steven wangwangk...@gmail.com wrote:
[...]
 I want to get the guest memory address in the instruction mov
 0x4(%ebx)  %eax, whic is 0x4(%ebx).
 Since %ebx is not resolved until the execution time, the code in
 softmmu_header.h does not generate any hit or miss information.
 Do you know any place that I could resolve the memory access address? Thanks.

You'll have to generate code.  Look at how helpers work.


Laurent



Re: [Qemu-devel] Is the return address of get_page_addr_code guest physical address?

2012-08-11 Thread Laurent Desnogues
On Sat, Aug 11, 2012 at 5:41 AM, Steven wangwangk...@gmail.com wrote:
[...]
 I want to get the guest physical address of a pc. I note the part of
 the function cpu_x86_handle_mmu_fault will do something like page
 walking to convert a pc to its guest physical address. I think this is
 the guest physical address I need. However, there is no other function
 available to do this page walking.
 So I am thinking add a function to do the conversion.

If you want the translation of any guest virtual address to guest physical
address then cpu_x86_handle_mmu_fault is close to what you want.  Of
course you'd have to rewrite it as you probably don't want your function
to change the CPU env.

Perhaps cpu_get_phys_page_debug is even closer to what you need,
but I'm not familiar enough with x86 to say for sure.


Laurent



Re: [Qemu-devel] Smc support in qemu

2012-08-03 Thread Laurent Desnogues
On Thu, Aug 2, 2012 at 2:20 PM, Itaru Kitayama kitay...@cl.bb4u.ne.jp wrote:
 The recent upstream highbank kernel uses smc to enable its L2 cache,
 but on a qemu virt machine
 it is not supported yet. Is it likely supported by qemu soon? What is
 the time frame for that?

For other people reading this, SMC is the instruction used by ARM
processors to call routines written in the Secure world.  The issue
here is that even if you had support for that instruction, it wouldn't
be enough because each platform has different SMC
implementations, which code of course is not public, and quite
often even the API is undocumented.

Riku Voipio has a branch which implements some basic security
stuff here:

http://git.linaro.org/gitweb?p=people/rikuvoipio/qemu.git;a=shortlog;h=refs/heads/linaro

(That branch is old, but it can help to understand the basic
stuff needed.)

In helper.c:

 case EXCP_SMC:
 if (semihosting_enabled) {
 cpu_abort(env, SMC handling under semihosting not implemented\n);
 return;
 }
 if ((env-uncached_cpsr  CPSR_M) == ARM_CPU_MODE_SMC) {
 env-cp15.c1_secfg = ~1;
 }
 offset = env-thumb ? 2 : 0;
 new_mode = ARM_CPU_MODE_SMC;
 addr = 0x08;
 mask = CPSR_A | CPSR_I | CPSR_F;
 break;

If you know the SMC API, you could perhaps plug some code
to run when semihosting_enabled is set (though that might
conflict with the other uses of that flag, so one could perhaps
use some other flag that would say if we simulate SMC code
for OMAP, or Calxeda, or Tegra 3, etc.).

 If support can not be expected any time soon, how do we work around
 the issue? I have been using
 qemu-linaro-1.1.50-2012.07.

I'm afraid the easiest way is to patch the kernel.


Laurent



Re: [Qemu-devel] Smc support in qemu

2012-08-03 Thread Laurent Desnogues
On Fri, Aug 3, 2012 at 10:37 AM, Peter Maydell peter.mayd...@linaro.org wrote:
[...]
 http://git.linaro.org/gitweb?p=people/rikuvoipio/qemu.git;a=shortlog;h=refs/heads/linaro

 That's just a personal (and as you say old) copy of qemu-linaro.
 qemu-linaro proper is here:
 http://git.linaro.org/gitweb?p=qemu/qemu-linaro.git;a=shortlog;h=refs/heads/rebasing

I should have known better, thanks :-)

 My proposal for how we should handle SMC is here:
 http://lists.gnu.org/archive/html/qemu-devel/2012-05/msg03012.html

 (the code in qemu-linaro for SMC is similar to that but not
 exactly the same semantics).

If I correctly understood your proposal, it would require some
ARM code to simulate the behaviour of the secure ROM,
right?  IMHO that'd be much more complex than what is
needed to solve Itaru Kitayama issue.  In fact, I wonder if
considering all SMC calls as NOPs wouldn't be enough to
run non secure kernels using that feature (of course, one
would still need some of the secure cp15 registers, but
that's not a very difficult task I guess).


Laurent



Re: [Qemu-devel] Adding a parameter to a helper

2012-07-31 Thread Laurent Desnogues
On Mon, Jul 30, 2012 at 6:40 PM, Jose Cano Reyes jc...@ac.upc.edu wrote:
 I am trying to add a new integer parameter to an existing helper and call
 this helper in targeti386/translate.c. I have several problems:

 1) I cannot add an integer parameter to the helper, the compiler says that
 it must be TCGv_i32, despite I declare this new parameter as int in
 target-i386/helper.h. Why?

Helpers only accept TCGv parameters.

 2) If I use the the function tcg_const_i32 in order to convert my integer
 to TCGv_i32 I always obtain the same output value, that is:

 tcg_const_i32(10) = 1074260520
 tcg_const_i32(22) = 1074260520
 tcg_const_i32(30) = 1074260520
 ...

TCGv is an index, not the value it represents.  Think of it
as an id.

tcg_const will allocate a TCGv and then emit a TCG mov
instruction to assign it a value.

 3) Moreover, wen I pass this value in the helper call gen_helper_flds_ST0,
 that is:

  gen_helper_flds_ST0(cpu_tmp2_i32, tcg_const_i32(MY_INT_VALUE));

 How can I use MY_INT_VALUE later in the function tcg_gen_helperN .
 This function is called by DEF_HELPER_FLAGS2, which corresponds to
 DEF_HELPER_2 (definition of my helper).

Look at helper_aam, that should help :-)


Laurent



Re: [Qemu-devel] Adding a parameter to a helper

2012-07-31 Thread Laurent Desnogues
On Tue, Jul 31, 2012 at 5:09 PM, Jose Cano Reyes jc...@ac.upc.edu wrote:
 - So, how ca I obtain the value that TCGv_i32 represents?

In the generated code (that is after TCG is translated to host
machine code), you'll et your value in your helper.  If you mean
before running the helper, then it's much more complex and
would require to process the TCG code.

 - I don't understand well how a helper functions. For instance, cosidering
 this call to a helper again:

 gen_helper_flds_ST0(cpu_tmp2_i32, tcg_const_i32(MY_INT_VALUE))

   Can I obtain the parameters cpu_tmp2_i32 and
 tcg_const_i32(MY_INT_VALUE) from the args[0] and args[1] described in
 DEF_HELPER_FLAGS_2???

cf. above.  Again look at existing helpers and how they get
const values.


Laurent



Re: [Qemu-devel] Using QEMU for gcc testing

2012-07-19 Thread Laurent Desnogues
On Thu, Jul 19, 2012 at 5:29 PM, Michael Eager ea...@eagerm.com wrote:
 I'm interested in using QEMU to test gcc for a processor.
 This is a hard-metal target -- there is no operating system.

 Can anyone make suggestions on how to do this?

You could look at how QEMU implements semihosting for ARM.
And also how gcc and associated libraries are using
semihosting for ARM.  You could then use something similar
for your processor.

HTH,

Laurent



Re: [Qemu-devel] [RFC][PATCH v2 4/4] configure: add CONFIG_QEMU_LDST_OPTIMIZATION for TCG qemu_ld/st optimization

2012-07-05 Thread Laurent Desnogues
On Thu, Jul 5, 2012 at 4:06 PM, Peter Maydell peter.mayd...@linaro.org wrote:
[...]
  case $target_arch2 in
alpha | sparc* | xtensa* | ppc*)
  echo CONFIG_TCG_PASS_AREG0=y  $config_target_mak
 +# qemu_ld/st optimization is not available with CONFIG_TCG_PASS_AREG0
 +target_ldst_optimization=no

 PASS_AREG0 is the way of the future -- you need to fix the ldst
 optimization to work with it.

Agreed.  But what's the point of speeding up on one side and
losing speed on the other side?  AREG0 slowdown would be
acceptable if the ARM target was using less helpers, until this
happens I don't think it is a good idea to push AREG0,
hence I'm not sure it is a prerequisite that Yeongkyoon Lee's
patch supports it.  By the way, it's also a good reason to have
CONFIG_QEMU_LDST_OPTIMIZATION, which you
commented on in patch 3.

Basically, and in my humble opinion, this is not a good
enough reason to reject the patch :-)  Of course the support
should be added as soon as possible once the rest has
been discussed.


Laurent



Re: [Qemu-devel] How to measure guest memory access (qemu_ld/qemu_st) time?

2012-06-15 Thread Laurent Desnogues
On Fri, Jun 15, 2012 at 12:30 AM, Lluís Vilanova vilan...@ac.upc.edu wrote:
[...]
 Now that I think of it, you will have problems generating code to surround 
 each
 qemu_ld/st with a lightweight mechanism to get the time. In x86 it would be
 rdtsc, but you want to generate a host rdtsc instruction inside the code
 generated by QEMU's TCG, so you should also have to hack TCG (or the code
 generation pointers) to issue an rdtsc instruction.

Even rdtsc would introduce enough noise that it wouldn't be reliable
for such a micro measurement:  as far as I understand it, this instruction
can be reordered, so you need to flush the pipeline before issuing it.

Intel has a document about that:
download.intel.com/embedded/software/IA/324264.pdf
The overhead of their proposed method is so high that it's likely it
would take longer than the execution of the fast path itself.

IMHO a mix of YeongKyoon Lee way to count ld/st and comparison
between user mode and softmmu still seems to be the best approach
(well unless you have access to a cycle accurate simulator :-).


Laurent



Re: [Qemu-devel] How to measure guest memory access (qemu_ld/qemu_st) time?

2012-06-13 Thread Laurent Desnogues
On Wed, Jun 13, 2012 at 5:14 AM, 陳韋任 (Wei-Ren Chen)
che...@iis.sinica.edu.tw wrote:
 Hi all,

  I suspect that guest memory access (qemu_ld/qemu_st) account for the major of
 time spent in system mode. I would like to know precisely how much (if 
 possible).
 We use tools like perf [1] before, but since the logic of guest memory access 
 aslo
 embedded in the host binary not only helper functions, the result cannot be
 relied. The current idea is adding helper functions before/after guest memory
 access logic. Take ARM guest on x86_64 host for example, should I add the 
 helper
 functions before/after tcg_gen_qemu_{ld,st} in target-arm/translate.c or
 tcg_out_qemu_{ld,st} in tcg/i386/tcg-target.c? Or there is a better way to 
 know
 how much time QEMU spend on handling guest memory access?

I'm afraid there's no easy way to measure that: any change you make
to generated code will completely change the timing given that the ld/st
fast path is only a few instructions long.

Another approach might be to run the program in user mode and then
in system mode (provided the guest OS is very light).

As a side note, it might be interesting to gather statistics about the hit
rate of the QEMU TLB.  Another thing to consider is speeding up the
fast path;  see YeongKyoon Lee RFC patch:

http://www.mail-archive.com/qemu-devel@nongnu.org/msg91294.html


Laurent



Re: [Qemu-devel] ARM: Virtual / Physical address translation

2012-06-04 Thread Laurent Desnogues
On Thu, May 31, 2012 at 9:07 PM, Ira Ray Jenkins
irarayjenk...@gmail.com wrote:
 On Wed, May 30, 2012 at 10:30 AM, Laurent Desnogues
 laurent.desnog...@gmail.com wrote:
 On Wed, May 30, 2012 at 3:20 AM, Peter Maydell peter.mayd...@linaro.org 
 wrote:
 On 30 May 2012 02:00, Ira Ray Jenkins irarayjenk...@gmail.com wrote:
 What I would like is to be able to get the physical addresses of both
 data and instructions. Can anyone help me work through how to get the
 properly translated physical addresses given the virtual address?

 See the function get_phys_addr() in target-arm/helper.c ... That is
 a private function but if you're doing a local hack you can wire
 it up to what you need it for.

 Using that function directly is not that great an idea as it has
 side effects on the environment.  IMHO the best is to duplicate
 it and remove the side effects (which is what I did for my
 cache simulator).

 BTW Edgar Iglesias has implemented a cache simulator in
 QEMU.  I just can't remember where the repository is...


 Laurent

 Would I also need to duplicate get_phys_addr_mpu/v5/v6 ?

Depending on the target CPU, you'd have to duplicate one of them.

 The side effects you mentioned, are these in the above functions?

I was wrong in my previous mail:  the changes to env are done
in cpu_arm_handle_mmu_fault which is the entry point to handle
VA/PA translation from generated code.

 Since I'm really only interested in the physical address - phys_ptr -
 , can I ignore protection  page size? What about access type and
 user?

You don't need to return these values.  But be careful about
translations that fault:  if you insert your helper call before the
code for the emulated ld/st, you can get faults and in this
case you probably don't want to log that memory access.

 Edgar's work was for the cris target, so I'm unsure if it is different
 for arm.

The way it's done should be similar.

 Basically, since I am just doing a memory trace dump for arm
 target, I just want a simple translation from virtual address to
 physical, unobtrusively - without modify the state/env. I'm not sure
 how to modify get_phys_addr*() to do this. Any help would be great.

I hope you have enough information now.


Laurent



Re: [Qemu-devel] ARM: Virtual / Physical address translation

2012-05-30 Thread Laurent Desnogues
On Wed, May 30, 2012 at 3:20 AM, Peter Maydell peter.mayd...@linaro.org wrote:
 On 30 May 2012 02:00, Ira Ray Jenkins irarayjenk...@gmail.com wrote:
 What I would like is to be able to get the physical addresses of both
 data and instructions. Can anyone help me work through how to get the
 properly translated physical addresses given the virtual address?

 See the function get_phys_addr() in target-arm/helper.c ... That is
 a private function but if you're doing a local hack you can wire
 it up to what you need it for.

Using that function directly is not that great an idea as it has
side effects on the environment.  IMHO the best is to duplicate
it and remove the side effects (which is what I did for my
cache simulator).

BTW Edgar Iglesias has implemented a cache simulator in
QEMU.  I just can't remember where the repository is...


Laurent



Re: [Qemu-devel] [PATCH 08/10] qapi: add strict mode to input visitor

2012-04-02 Thread Laurent Desnogues
Hello,

On Thu, Mar 22, 2012 at 12:51 PM, Paolo Bonzini pbonz...@redhat.com wrote:
 While QMP in general is designed so that it is possible to ignore
 unknown arguments, in the case of the QMP server it is better to
 reject them to detect bad clients.  In fact, we're already doing
 this at the top level in the argument checker.  To extend this to
 complex structures, add a mode to the input visitor where it checks
 for unvisited keys and raises an error if it finds one.

 Signed-off-by: Paolo Bonzini pbonz...@redhat.com
 ---
  qapi/qmp-input-visitor.c |   48 +-
  qapi/qmp-input-visitor.h |    2 +
  test-qmp-input-strict.c  |  234 
 ++
  tests/Makefile           |    5 +-
  4 files changed, 285 insertions(+), 4 deletions(-)
  create mode 100644 test-qmp-input-strict.c

 diff --git a/qapi/qmp-input-visitor.c b/qapi/qmp-input-visitor.c
 index 97a0186..eb09874 100644
 --- a/qapi/qmp-input-visitor.c
 +++ b/qapi/qmp-input-visitor.c
 @@ -24,6 +24,7 @@ typedef struct StackObject
  {
     QObject *obj;
     const QListEntry *entry;
 +    GHashTable *h;
  } StackObject;

  struct QmpInputVisitor
 @@ -31,6 +32,7 @@ struct QmpInputVisitor
     Visitor visitor;
     StackObject stack[QIV_STACK_SIZE];
     int nb_stack;
 +    bool strict;
  };

  static QmpInputVisitor *to_qiv(Visitor *v)
 @@ -45,6 +47,9 @@ static QObject *qmp_input_get_object(QmpInputVisitor *qiv,

     if (qobj) {
         if (name  qobject_type(qobj) == QTYPE_QDICT) {
 +            if (qiv-stack[qiv-nb_stack - 1].h) {
 +                g_hash_table_remove(qiv-stack[qiv-nb_stack - 1].h, name);
 +            }
             return qdict_get(qobject_to_qdict(qobj), name);
         } else if (qiv-stack[qiv-nb_stack - 1].entry) {
             return qlist_entry_obj(qiv-stack[qiv-nb_stack - 1].entry);
 @@ -54,20 +59,47 @@ static QObject *qmp_input_get_object(QmpInputVisitor *qiv,
     return qobj;
  }

 +static void qdict_add_key(const char *key, QObject *obj, void *opaque)
 +{
 +    GHashTable *h = opaque;
 +    g_hash_table_insert(h, (gpointer) key, NULL);
 +}
 +
  static void qmp_input_push(QmpInputVisitor *qiv, QObject *obj, Error **errp)
  {
 -    qiv-stack[qiv-nb_stack].obj = obj;
 -    qiv-stack[qiv-nb_stack].entry = NULL;
 -    qiv-nb_stack++;
 +    GHashTable *h;

     if (qiv-nb_stack = QIV_STACK_SIZE) {
         error_set(errp, QERR_BUFFER_OVERRUN);
         return;
     }
 +
 +    qiv-stack[qiv-nb_stack].obj = obj;
 +    qiv-stack[qiv-nb_stack].entry = NULL;
 +    qiv-stack[qiv-nb_stack].h = NULL;
 +
 +    if (qiv-strict  qobject_type(obj) == QTYPE_QDICT) {
 +        h = g_hash_table_new(g_str_hash, g_str_equal);
 +        qdict_iter(qobject_to_qdict(obj), qdict_add_key, h);
 +        qiv-stack[qiv-nb_stack].h = h;
 +    }
 +
 +    qiv-nb_stack++;
  }

  static void qmp_input_pop(QmpInputVisitor *qiv, Error **errp)
  {
 +    GHashTableIter iter;

GHashTableIter is alas not available in the glib (2.12) that
the distros we use at work run.  Is there a workaround for
this issue?

Thanks,

Laurent

 +    gpointer key;
 +
 +    if (qiv-strict  qiv-stack[qiv-nb_stack - 1].h) {
 +        g_hash_table_iter_init(iter, qiv-stack[qiv-nb_stack - 1].h);
 +        if (g_hash_table_iter_next(iter, key, NULL)) {
 +            error_set(errp, QERR_QMP_EXTRA_MEMBER, (char *) key);
 +        }
 +        g_hash_table_unref(qiv-stack[qiv-nb_stack - 1].h);
 +    }
 +
     assert(qiv-nb_stack  0);
     qiv-nb_stack--;
  }
 @@ -257,3 +289,13 @@ QmpInputVisitor *qmp_input_visitor_new(QObject *obj)

     return v;
  }
 +
 +QmpInputVisitor *qmp_input_visitor_new_strict(QObject *obj)
 +{
 +    QmpInputVisitor *v;
 +
 +    v = qmp_input_visitor_new(obj);
 +    v-strict = true;
 +
 +    return v;
 +}
 diff --git a/qapi/qmp-input-visitor.h b/qapi/qmp-input-visitor.h
 index 3f798f0..e0a48a5 100644
 --- a/qapi/qmp-input-visitor.h
 +++ b/qapi/qmp-input-visitor.h
 @@ -20,6 +20,8 @@
  typedef struct QmpInputVisitor QmpInputVisitor;

  QmpInputVisitor *qmp_input_visitor_new(QObject *obj);
 +QmpInputVisitor *qmp_input_visitor_new_strict(QObject *obj);
 +
  void qmp_input_visitor_cleanup(QmpInputVisitor *v);

  Visitor *qmp_input_get_visitor(QmpInputVisitor *v);
 diff --git a/test-qmp-input-strict.c b/test-qmp-input-strict.c
 new file mode 100644
 index 000..f6df8cb
 --- /dev/null
 +++ b/test-qmp-input-strict.c
 @@ -0,0 +1,234 @@
 +/*
 + * QMP Input Visitor unit-tests (strict mode).
 + *
 + * Copyright (C) 2011-2012 Red Hat Inc.
 + *
 + * Authors:
 + *  Luiz Capitulino lcapitul...@redhat.com
 + *  Paolo Bonzini pbonz...@redhat.com
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2 or later.
 + * See the COPYING file in the top-level directory.
 + */
 +
 +#include glib.h
 +#include stdarg.h
 +
 +#include qapi/qmp-input-visitor.h
 +#include test-qapi-types.h
 +#include test-qapi-visit.h
 +#include qemu-objects.h
 +
 +typedef struct TestInputVisitorData {
 +    QObject *obj;
 +    QmpInputVisitor *qiv;
 +} 

Re: [Qemu-devel] [PATCH v2 0/6] ARM: AREG0 conversion

2012-03-29 Thread Laurent Desnogues
On Tue, Mar 27, 2012 at 9:59 PM, Artyom Tarasenko atar4q...@gmail.com wrote:
 On Tue, Mar 27, 2012 at 7:01 PM, Laurent Desnogues
 laurent.desnog...@gmail.com wrote:
 On Tue, Mar 27, 2012 at 6:48 PM, Blue Swirl blauwir...@gmail.com wrote:
 On Tue, Mar 27, 2012 at 13:40, Laurent Desnogues
 laurent.desnog...@gmail.com wrote:
 On Mon, Mar 26, 2012 at 7:02 PM, Blue Swirl blauwir...@gmail.com wrote:
 [...]
 At least stack protector is protecting more code than before (for
 example TLB miss handler), but could overhead from that amount to 5%?

 Otherwise there should be just a few extra register moves here and
 there, that should be cheap on modern processors.

 The extra moves might be cheap but their cost is obviously not 0:
 on top of using extra CPU core resources, code size is increased
 which results in more instruction cache misses.

 I didn't like the idea when we discussed it back in May, now it
 looks like we have concrete evidence the speed impact is
 measurable (though I'd like some more numbers than the rough
 5% estimate I gave).

 A clearly defined test case running on a host that does not adjust
 clock frequencies would be nice. It would be interesting to find out
 where exactly the slowdown comes from.

 Perhaps the access helpers ({helper,_}_{ld,st}{b,w,l}_mmu) generated
 by softmmu_template.h are the culprit. If so, they could be split from
 other code and moved to TCG back ends. That way the interface could be
 improved while keeping all other cleanups.

 I also get a slowdown running in user mode, so I don't think
 improving the mmu ld/st will completely remove the issue.
 In that case the slowdown comes from the extra move
 instructions for helper calls.  The ARM target uses way too
 many helpers, but that's another discussion :-)


 Have you tried compiling without -fstack-protector-all as suggested by Lluís?
 I observe a similar slowdown on a sparc target, and there compiling
 without stack protection definitely helps.

That will indeed probably make the real problem, which is that
this patch increases the size of generated code, less obvious
on small benchmarks that don't put pressure on instruction
cache.  But the fact is that generated code is larger and will
have to execute more instructions, so no matter what you do,
this will have an impact on speed.


Laurent



Re: [Qemu-devel] [PATCH v2 0/6] ARM: AREG0 conversion

2012-03-27 Thread Laurent Desnogues
On Mon, Mar 26, 2012 at 7:02 PM, Blue Swirl blauwir...@gmail.com wrote:
[...]
 At least stack protector is protecting more code than before (for
 example TLB miss handler), but could overhead from that amount to 5%?

 Otherwise there should be just a few extra register moves here and
 there, that should be cheap on modern processors.

The extra moves might be cheap but their cost is obviously not 0:
on top of using extra CPU core resources, code size is increased
which results in more instruction cache misses.

I didn't like the idea when we discussed it back in May, now it
looks like we have concrete evidence the speed impact is
measurable (though I'd like some more numbers than the rough
5% estimate I gave).


Laurent



Re: [Qemu-devel] [PATCH v2 0/6] ARM: AREG0 conversion

2012-03-27 Thread Laurent Desnogues
On Tue, Mar 27, 2012 at 6:48 PM, Blue Swirl blauwir...@gmail.com wrote:
 On Tue, Mar 27, 2012 at 13:40, Laurent Desnogues
 laurent.desnog...@gmail.com wrote:
 On Mon, Mar 26, 2012 at 7:02 PM, Blue Swirl blauwir...@gmail.com wrote:
 [...]
 At least stack protector is protecting more code than before (for
 example TLB miss handler), but could overhead from that amount to 5%?

 Otherwise there should be just a few extra register moves here and
 there, that should be cheap on modern processors.

 The extra moves might be cheap but their cost is obviously not 0:
 on top of using extra CPU core resources, code size is increased
 which results in more instruction cache misses.

 I didn't like the idea when we discussed it back in May, now it
 looks like we have concrete evidence the speed impact is
 measurable (though I'd like some more numbers than the rough
 5% estimate I gave).

 A clearly defined test case running on a host that does not adjust
 clock frequencies would be nice. It would be interesting to find out
 where exactly the slowdown comes from.

 Perhaps the access helpers ({helper,_}_{ld,st}{b,w,l}_mmu) generated
 by softmmu_template.h are the culprit. If so, they could be split from
 other code and moved to TCG back ends. That way the interface could be
 improved while keeping all other cleanups.

I also get a slowdown running in user mode, so I don't think
improving the mmu ld/st will completely remove the issue.
In that case the slowdown comes from the extra move
instructions for helper calls.  The ARM target uses way too
many helpers, but that's another discussion :-)


Laurent



Re: [Qemu-devel] [PATCH 1/6] arm: move neon_tbl to neon_helper.c

2012-03-20 Thread Laurent Desnogues
On Mon, Mar 19, 2012 at 11:10 PM, Peter Maydell
peter.mayd...@linaro.org wrote:
 On 19 March 2012 21:56, Blue Swirl blauwir...@gmail.com wrote:
 -DEF_HELPER_4(neon_tbl, i32, i32, i32, i32, i32)
 +DEF_HELPER_5(neon_tbl, i32, env, i32, i32, i32, i32)

 --- a/target-arm/translate.c
 +++ b/target-arm/translate.c
 @@ -6340,7 +6340,7 @@ static int disas_neon_data_insn(CPUARMState *
 env, DisasContext *s, uint32_t ins
                 tmp2 = neon_load_reg(rm, 0);
                 tmp4 = tcg_const_i32(rn);
                 tmp5 = tcg_const_i32(n);
 -                gen_helper_neon_tbl(tmp2, tmp2, tmp, tmp4, tmp5);
 +                gen_helper_neon_tbl(cpu_env, tmp2, tmp2, tmp, tmp4, tmp5);
                 tcg_temp_free_i32(tmp);
                 if (insn  (1  6)) {
                     tmp = neon_load_reg(rd, 1);
 @@ -6349,7 +6349,7 @@ static int disas_neon_data_insn(CPUARMState *
 env, DisasContext *s, uint32_t ins
                     tcg_gen_movi_i32(tmp, 0);
                 }
                 tmp3 = neon_load_reg(rm, 1);
 -                gen_helper_neon_tbl(tmp3, tmp3, tmp, tmp4, tmp5);
 +                gen_helper_neon_tbl(cpu_env, tmp3, tmp3, tmp, tmp4, tmp5);
                 tcg_temp_free_i32(tmp5);
                 tcg_temp_free_i32(tmp4);
                 neon_store_reg(rd, 0, tmp2);

 ...shouldn't these be
  gen_helper_neon_tbl(tmp3, cpu_env, tmp3, tmp, tmp4, tmp5);

Indeed.  Compiling with --enable-debug doesn't work.


Laurent



Re: [Qemu-devel] [PATCH 0/6] ARM: AREG0 conversion

2012-03-20 Thread Laurent Desnogues
On Mon, Mar 19, 2012 at 10:55 PM, Blue Swirl blauwir...@gmail.com wrote:
 Convert ARM to AREG0 free operation. Survives simple tests.

After fixing the issue about tbl helper usage, I could run some
simple linux-user tests and boot a rather large Linux image.
It looks like the kernel boot is about 5% slower.


Laurent

 URL     git://repo.or.cz/qemu/blueswirl.git
        http://repo.or.cz/r/qemu/blueswirl.git

 Blue Swirl (6):
  arm: move neon_tbl to neon_helper.c
  arm: move saturating arithmetic to helper.c
  arm: move other arithmetic to helper.c
  arm: move cpsr and banked register access to helper.c
  arm: move exception and wfi helpers to helper.c
  arm: move load and store helpers, switch to AREG0 free mode

  Makefile.target          |    4 +-
  configure                |    2 +-
  target-arm/helper.c      |  388 +-
  target-arm/helper.h      |   60 
  target-arm/neon_helper.c |   22 +++
  target-arm/op_helper.c   |  430 
 --
  target-arm/translate.c   |  148 
  7 files changed, 513 insertions(+), 541 deletions(-)
  delete mode 100644 target-arm/op_helper.c

 --
 1.7.9




Re: [Qemu-devel] [PATCH] target-arm: Decode SETEND correctly in Thumb

2012-03-13 Thread Laurent Desnogues
On Tue, Mar 13, 2012 at 3:45 PM, Peter Maydell peter.mayd...@linaro.org wrote:
 Decode the SETEND instruction correctly in Thumb mode,
 rather than accidentally treating it like CPS. We don't
 support BE8 mode, but this change brings the Thumb mode
 in to line with behaviour in ARM mode: 'SETEND BE' is
 not supported and will provoke an UNDEF exception, but
 'SETEND LE' is correctly handled as a no-op.

 Signed-off-by: Peter Maydell peter.mayd...@linaro.org
 Reported-by: Daniel Forsgren daniel.forsg...@enea.com

The patch looks good to me;  I guess this would qualify for a:

Reviewed-by: Laurent Desnogues laurent.desnog...@gmail.com

The only point I dislike isn't directly related to this patch:
the use of illegal_op that behaves exactly as UNDEF
looks odd.


Laurent

 ---
 This is one of those patches where I wasn't sure whether to try
 to split it into a whitespace-only part and a significant-change
 part. Most of this is (a) indenting existing code another notch
 for the extra switch statement and (b) adding braces to placate
 checkpatch.

  target-arm/translate.c |   63 ++-
  1 files changed, 40 insertions(+), 23 deletions(-)

 diff --git a/target-arm/translate.c b/target-arm/translate.c
 index 280bfca..3196619 100644
 --- a/target-arm/translate.c
 +++ b/target-arm/translate.c
 @@ -9704,32 +9704,49 @@ static void disas_thumb_insn(CPUState *env, 
 DisasContext *s)
             store_reg(s, rd, tmp);
             break;

 -        case 6: /* cps */
 -            ARCH(6);
 -            if (IS_USER(s))
 +        case 6:
 +            switch ((insn  5)  7) {
 +            case 2:
 +                /* setend */
 +                ARCH(6);
 +                if (insn  (1  3)) {
 +                    /* BE8 mode not implemented.  */
 +                    goto illegal_op;
 +                }
                 break;
 -            if (IS_M(env)) {
 -                tmp = tcg_const_i32((insn  (1  4)) != 0);
 -                /* FAULTMASK */
 -                if (insn  1) {
 -                    addr = tcg_const_i32(19);
 -                    gen_helper_v7m_msr(cpu_env, addr, tmp);
 -                    tcg_temp_free_i32(addr);
 +            case 3:
 +                /* cps */
 +                ARCH(6);
 +                if (IS_USER(s)) {
 +                    break;
                 }
 -                /* PRIMASK */
 -                if (insn  2) {
 -                    addr = tcg_const_i32(16);
 -                    gen_helper_v7m_msr(cpu_env, addr, tmp);
 -                    tcg_temp_free_i32(addr);
 +                if (IS_M(env)) {
 +                    tmp = tcg_const_i32((insn  (1  4)) != 0);
 +                    /* FAULTMASK */
 +                    if (insn  1) {
 +                        addr = tcg_const_i32(19);
 +                        gen_helper_v7m_msr(cpu_env, addr, tmp);
 +                        tcg_temp_free_i32(addr);
 +                    }
 +                    /* PRIMASK */
 +                    if (insn  2) {
 +                        addr = tcg_const_i32(16);
 +                        gen_helper_v7m_msr(cpu_env, addr, tmp);
 +                        tcg_temp_free_i32(addr);
 +                    }
 +                    tcg_temp_free_i32(tmp);
 +                    gen_lookup_tb(s);
 +                } else {
 +                    if (insn  (1  4)) {
 +                        shift = CPSR_A | CPSR_I | CPSR_F;
 +                    } else {
 +                        shift = 0;
 +                    }
 +                    gen_set_psr_im(s, ((insn  7)  6), 0, shift);
                 }
 -                tcg_temp_free_i32(tmp);
 -                gen_lookup_tb(s);
 -            } else {
 -                if (insn  (1  4))
 -                    shift = CPSR_A | CPSR_I | CPSR_F;
 -                else
 -                    shift = 0;
 -                gen_set_psr_im(s, ((insn  7)  6), 0, shift);
 +                break;
 +            default:
 +                goto undef;
             }
             break;

 --
 1.7.1





Re: [Qemu-devel] [MASCOT CONTEST] Alex Bradbury #1

2012-02-15 Thread Laurent Desnogues
On Wed, Feb 15, 2012 at 3:31 PM, Anthony Liguori anth...@codemonkey.ws wrote:
 Please respond to this note with an '+1', or an Ack, to vote for this icon.

Ack


Laurent



Re: [Qemu-devel] profiling qemu

2012-02-14 Thread Laurent Desnogues
2012/2/14 Lluís Vilanova vilan...@ac.upc.edu:
 Artyom Tarasenko writes:
[...]
 Here it looks like compute_all_sub and compute_all_sub_xcc are
 good candidates for optimizing: together they take the same amount of
 time as cpu_sparc_exec. I guess both operations would be trivial in
 the x86_64 assembler. What would be the best strategy to make TCG take
 the advantage of running on a x86_64 host?

 A quick look into the code reveals that these two are called from a TCG helper
 (helper_compute_psr), so I see two approaches here applicable to the most
 frequently used sub-operations in helper_compute_psr:

 * Define new simpler helpers for those sub-operations that can be declared 
 with
  TCG_CALL_CONST and generate the new psr/xcc values in temporal registers. You
  must make sure any other code will still be able to use the new psr/xcc
  values.

 * Reimplement these sub-operations in pure TCG code.


 But first, make sure you run a proper benchmark to establish where are the
 hotspots in the sparc code for QEMU. The problem here is to establish what a
 proper benchmark is :)

Similar helpers are used in ARM translation, so I'm not surprised
they show up (typically sub/flag instructions are used for loops).

A good strategy is indeed to generate TCG code and let the
NZ/C/etc. be global temps as other CPU registers.  This gains a
few percents of speed.

HTH,

Laurent



Re: [Qemu-devel] profiling qemu

2012-02-14 Thread Laurent Desnogues
On Tue, Feb 14, 2012 at 4:15 PM, Artyom Tarasenko atar4q...@gmail.com wrote:
 2012/2/14 Laurent Desnogues laurent.desnog...@gmail.com:
 2012/2/14 Lluís Vilanova vilan...@ac.upc.edu:
 Artyom Tarasenko writes:
 [...]
 Here it looks like compute_all_sub and compute_all_sub_xcc are
 good candidates for optimizing: together they take the same amount of
 time as cpu_sparc_exec. I guess both operations would be trivial in
 the x86_64 assembler. What would be the best strategy to make TCG take
 the advantage of running on a x86_64 host?

 A quick look into the code reveals that these two are called from a TCG 
 helper
 (helper_compute_psr), so I see two approaches here applicable to the most
 frequently used sub-operations in helper_compute_psr:

 * Define new simpler helpers for those sub-operations that can be declared 
 with
  TCG_CALL_CONST and generate the new psr/xcc values in temporal registers. 
 You
  must make sure any other code will still be able to use the new psr/xcc
  values.

 * Reimplement these sub-operations in pure TCG code.


 But first, make sure you run a proper benchmark to establish where are the
 hotspots in the sparc code for QEMU. The problem here is to establish what a
 proper benchmark is :)

 Similar helpers are used in ARM translation, so I'm not surprised
 they show up (typically sub/flag instructions are used for loops).

 A good strategy is indeed to generate TCG code and let the
 NZ/C/etc. be global temps as other CPU registers.  This gains a
 few percents of speed.

 Can you give an example, where global temp would be faster than an
 inline helper? At the first sight it's trading a cheap math operation
 (in case of sub, a few cheap math operations in case of subx) against
 a memory access. Or do you mean, use the global flag registers instead
 of CC_SRC{1,2} and always compute them?

I mean that for my work on ARM I just declared the NZ/C/V fields
as other registers (tcg_global_mem_new) and let the TCG
optimizer do the work.  I can't say if that would play well with
the SPARC front-end that mimics the way x86 does flag handling.
(I would argue that on x86 you have no choice because most
instructions do touch flags, while it's not true on SPARC, IIRC.)

I am afraid you'll have to try yourself and see if you gain
something :-)


Laurent



Re: [Qemu-devel] [PATCH 01/19] linux-user: stack_base is now mandatory on all targets

2012-02-08 Thread Laurent Desnogues
On Fri, Feb 3, 2012 at 3:49 PM,  riku.voi...@linaro.org wrote:
 From: Riku Voipio riku.voi...@linaro.org

 Signed-off-by: Riku Voipio riku.voi...@linaro.org
 ---
  linux-user/qemu.h |    2 +-
  1 files changed, 1 insertions(+), 1 deletions(-)

 diff --git a/linux-user/qemu.h b/linux-user/qemu.h
 index 55ad9d8..30e2abd 100644
 --- a/linux-user/qemu.h
 +++ b/linux-user/qemu.h
 @@ -123,10 +123,10 @@ typedef struct TaskState {
  #endif
  #if defined(TARGET_ARM) || defined(TARGET_M68K) || defined(TARGET_UNICORE32)
     /* Extra fields for semihosted binaries.  */
 -    uint32_t stack_base;
     uint32_t heap_base;
     uint32_t heap_limit;
  #endif
 +    uint32_t stack_base;

Shouldn't this be abi_ulong instead of uint32_t?

     int used; /* non zero if used */
     struct image_info *info;
     struct linux_binprm *bprm;

Laurent



Re: [Qemu-devel] icount and tb chaining

2012-01-17 Thread Laurent Desnogues
On Tue, Jan 17, 2012 at 7:50 PM, Peter Maydell peter.mayd...@linaro.org wrote:
 2012/1/13 James Greensky gsk...@gmail.com:
 Sure, usually a tb chain is setup after a subsequent tb is
 found/constructed in the loop in cpu_exec when a tb returns.
 Taken/non-taken branch chaining is implemented by indicating the
 branch direction by the two least significant digits of the the
 previously returned tb. This is usually 0/1. When running icount, you
 can also get a 2 value in these least significant digits, indicating
 that the translation block was restarted due to the
 icount_decr.u16.low field being exhausted but having instructions left
 to execute in icount_extra. This 2 value falls through to tb_add_jump,
 which then updates the tb's jmp_first field, as both tb and next_tb
 refer to the same translation block. My question is why is this
 necessary, why not do nothing, and leave the previous chaining intact?

 This looks suspiciously like a bug to me, although the code
 is a bit opaque to me. Calling tb_add_jump() with n==2 seems like
 it's not going to work since tb_set_jmp_target() is going to fall
 off the end of either tb_jmp_offset[] or tb_next[], so even if we're
 playing clever games with jmp_first being treatable as jmp_next[2] for
 some purposes there's going to be a problem.

 I thought the 2 return from a TB execution was only used as a flag
 value, which would suggest that we should clear next_tb in the
 'refill decrementer' code path too. I'm not sure I'm not missing
 something subtle, though.

Hmm I wonder if I didn't miss something in the discussion
because the only call to tb_add_jmp clears the lower two
bits so I fail to see the issue.


Laurent



Re: [Qemu-devel] icount and tb chaining

2012-01-17 Thread Laurent Desnogues
On Tue, Jan 17, 2012 at 8:06 PM, Laurent Desnogues
laurent.desnog...@gmail.com wrote:
 On Tue, Jan 17, 2012 at 7:50 PM, Peter Maydell peter.mayd...@linaro.org 
 wrote:
 2012/1/13 James Greensky gsk...@gmail.com:
 Sure, usually a tb chain is setup after a subsequent tb is
 found/constructed in the loop in cpu_exec when a tb returns.
 Taken/non-taken branch chaining is implemented by indicating the
 branch direction by the two least significant digits of the the
 previously returned tb. This is usually 0/1. When running icount, you
 can also get a 2 value in these least significant digits, indicating
 that the translation block was restarted due to the
 icount_decr.u16.low field being exhausted but having instructions left
 to execute in icount_extra. This 2 value falls through to tb_add_jump,
 which then updates the tb's jmp_first field, as both tb and next_tb
 refer to the same translation block. My question is why is this
 necessary, why not do nothing, and leave the previous chaining intact?

 This looks suspiciously like a bug to me, although the code
 is a bit opaque to me. Calling tb_add_jump() with n==2 seems like
 it's not going to work since tb_set_jmp_target() is going to fall
 off the end of either tb_jmp_offset[] or tb_next[], so even if we're
 playing clever games with jmp_first being treatable as jmp_next[2] for
 some purposes there's going to be a problem.

 I thought the 2 return from a TB execution was only used as a flag
 value, which would suggest that we should clear next_tb in the
 'refill decrementer' code path too. I'm not sure I'm not missing
 something subtle, though.

 Hmm I wonder if I didn't miss something in the discussion
 because the only call to tb_add_jmp clears the lower two
 bits so I fail to see the issue.

Hmm I knew I was missing something, forget this :-)


Laurent



Re: [Qemu-devel] [PATCH] target-i386: Fix regression with maxsd SSE2 instruction

2011-11-08 Thread Laurent Desnogues
On Tue, Nov 8, 2011 at 2:00 PM, Jason Wessel jason.wes...@windriver.com wrote:
 The maxsd instruction needs to take into account the sign of the
 numbers 64 bit numbers.  This is a regression that was introduced in
 347ac8e356 (target-i386: switch to softfloat).

 The case that fails is:

 maxsd  %xmm1,%xmm0

 When xmm1 = 24 and xmm0 = -100

 This was found running the glib2 binding tests where it prints the message:
 /binding/transform:
 GLib-GObject-WARNING **: value 24.00 of type `gdouble' is invalid or 
 out of range for property `value' of type `gdouble'
 aborting...

 Using a signed comparison fixes the problem.

 Signed-off-by: Jason Wessel jason.wes...@windriver.com
 ---
  target-i386/ops_sse.h |    4 ++--
  1 files changed, 2 insertions(+), 2 deletions(-)

 diff --git a/target-i386/ops_sse.h b/target-i386/ops_sse.h
 index aa41d25..bcc0ed9 100644
 --- a/target-i386/ops_sse.h
 +++ b/target-i386/ops_sse.h
 @@ -584,8 +584,8 @@ void helper_ ## name ## sd (Reg *d, Reg *s)\
  #define FPU_SUB(size, a, b) float ## size ## _sub(a, b, env-sse_status)
  #define FPU_MUL(size, a, b) float ## size ## _mul(a, b, env-sse_status)
  #define FPU_DIV(size, a, b) float ## size ## _div(a, b, env-sse_status)
 -#define FPU_MIN(size, a, b) (a)  (b) ? (a) : (b)
 -#define FPU_MAX(size, a, b) (a)  (b) ? (a) : (b)

Isn't maxsd a floating-point instruction?  If so, shouldn't
FPU_{MIN,MAX} use softfloat operations?


Laurent

 +#define FPU_MIN(size, a, b) (int ## size ## _t)(a)  (int ## size ## _t)(b) 
 ? (a) : (b)
 +#define FPU_MAX(size, a, b) (int ## size ## _t)(a)  (int ## size ## _t)(b) 
 ? (a) : (b)
  #define FPU_SQRT(size, a, b) float ## size ## _sqrt(b, env-sse_status)

  SSE_HELPER_S(add, FPU_ADD)
 --
 1.7.1






Re: [Qemu-devel] [PATCH] tcg: Remove stack protection from helper functions

2011-09-26 Thread Laurent Desnogues
On Mon, Sep 26, 2011 at 10:01 AM, Mulyadi Santosa
mulyadi.sant...@gmail.com wrote:
 Hi...

 On Mon, Sep 26, 2011 at 14:46, Jan Kiszka jan.kis...@siemens.com wrote:
 This increases the overhead of frequently executed helpers.

 Signed-off-by: Jan Kiszka jan.kis...@siemens.com

 IMHO, stack protector setup put more stuffs during epilogue, but quite
 likely it is negligible unless it cause too much L1 cache misses. So,
 I think this micro tuning is somewhat unnecessary but still okay.

The impact of stack protection is very high for instance running
FFmpeg ARM with NEON optimizations:  a few months ago I
measured that removing stack protection improved the run time
by more than 10%.  Of course it's extreme since the proportion
of NEON instructions (and hence of helper calls) is very high.


Laurent



Re: [Qemu-devel] [HelenOS-devel] [sparc64] Miscomputed minimum of a group of numbers in sparc64 emulation

2011-07-01 Thread Laurent Desnogues
On Fri, Jul 1, 2011 at 2:57 PM, Jakub Jermar ja...@jermar.eu wrote:
[...]
 When _not_ singlestepping via GDB's `stepi`, the testcase will fail and
 crash Qemu like this:

 qemu: fatal: Trap 0x0101 while trap level (5) = MAXTL (5), Error state
 register dumped here

 On the other hand, when I attach GDB to Qemu and step through all
 instructions using `stepi`, the testcase will succeed and crash Qemu
 like this:

 qemu: fatal: Trap 0x0100 while trap level (5) = MAXTL (5), Error state
 registers dumped here

 Mind the difference in the trap type - 0x100 for success, 0x101 for failure.

 This is how I run the test:

 Without GDB:
 $ qemu-system-sparc64 -bios ./testcase

 With GDB:
 $ qemu-system-sparc64 -bios ./testcase -s -S

 From another terminal:
 $ /usr/local/cross/sparc64/bin/sparc64-linux-gnu-gdb
 (gdb) set architecture sparc:v9
 (gdb) target remote localhost:1234
 (gdb) stepi
 ...

 Hope this helps to fix the problem.

You don't have to use gdb to reproduce the issue, just add -singlestep
when running qemu.

I find it odd that udivx is using cpu_cc_src and cpu_cc_src2.  Using
dedicated local temps seems to fix the issue.

HTH,

Laurent



Re: [Qemu-devel] [HelenOS-devel] [sparc64] Miscomputed minimum of a group of numbers in sparc64 emulation

2011-07-01 Thread Laurent Desnogues
On Fri, Jul 1, 2011 at 4:11 PM, Jakub Jermar ja...@jermar.eu wrote:
[...]
 Actually, the testcase can be further reduced into:

 .global _start

 .text

 .space 0x20

 _start:
        set 110393, %i1
        set 0x40, %i2

        cmp  %i1, %i2
        udivx  %g0, 1, %g0
        movgu  %xcc, %i2, %i1
        cmp  %i1, 512
        bgu  %xcc, 0f
        nop

 succ:
        ta 0

 fail:
 0:
        ta 1

 The presence of the `udivx` instruction seems to be essential. Even
 though it has no effect on the computation, removing it will make the
 testcase non-reproducible.

Could you try to replace udivx with sdivx?  It looks wrong too.


Laurent



Re: [Qemu-devel] [HelenOS-devel] [sparc64] Miscomputed minimum of a group of numbers in sparc64 emulation

2011-07-01 Thread Laurent Desnogues
On Fri, Jul 1, 2011 at 5:03 PM, Artyom Tarasenko atar4q...@gmail.com wrote:
[...]
 I find it odd that udivx is using cpu_cc_src and cpu_cc_src2.  Using
 dedicated local temps seems to fix the issue.

 Do we need to copy cpu_src* to further temps at all? IMHO

 -                        tcg_gen_mov_tl(cpu_cc_src, cpu_src1);
 -                        tcg_gen_mov_tl(cpu_cc_src2, cpu_src2);
 -                        gen_trap_ifdivzero_tl(cpu_cc_src2);
 -                        tcg_gen_divu_i64(cpu_dst, cpu_cc_src, cpu_cc_src2);
 +                        gen_trap_ifdivzero_tl(cpu_src2);
 +                        tcg_gen_divu_i64(cpu_dst, cpu_src1, cpu_src2);

 should do it. Or cpu_src is what you mean by dedicated?

You have to use two local temps here to store cpu_src1
and cpu_src2 because gen_trap_ifdivzero_tl uses
tcg_gen_brcondi_tl (cf tcg/README comment about
local usage and jumps).  Note you'll have to do something
similar in gen_op_sdivx.


Laurent



Re: [Qemu-devel] [PATCH][sparc64] fix cpu_cc_src and cpu_cc_src2 corruption in udivx and sdivx

2011-07-01 Thread Laurent Desnogues
On Fri, Jul 1, 2011 at 9:28 PM, Artyom Tarasenko atar4q...@gmail.com wrote:
 udivx and sdvix don't modify condition flags, so they shall not
 overwrite cpu_cc_*

Looks good to me.


Laurent

 Signed-off-by: Artyom Tarasenko atar4q...@gmail.com
 ---
  target-sparc/translate.c |   32 ++--
  1 files changed, 22 insertions(+), 10 deletions(-)

 diff --git a/target-sparc/translate.c b/target-sparc/translate.c
 index 992cd77..f32a674 100644
 --- a/target-sparc/translate.c
 +++ b/target-sparc/translate.c
 @@ -727,19 +727,24 @@ static inline void gen_trap_ifdivzero_tl(TCGv divisor)
  static inline void gen_op_sdivx(TCGv dst, TCGv src1, TCGv src2)
  {
     int l1, l2;
 +    TCGv r_temp1, r_temp2;

     l1 = gen_new_label();
     l2 = gen_new_label();
 -    tcg_gen_mov_tl(cpu_cc_src, src1);
 -    tcg_gen_mov_tl(cpu_cc_src2, src2);
 -    gen_trap_ifdivzero_tl(cpu_cc_src2);
 -    tcg_gen_brcondi_tl(TCG_COND_NE, cpu_cc_src, INT64_MIN, l1);
 -    tcg_gen_brcondi_tl(TCG_COND_NE, cpu_cc_src2, -1, l1);
 +    r_temp1 = tcg_temp_local_new();
 +    r_temp2 = tcg_temp_local_new();
 +    tcg_gen_mov_tl(r_temp1, src1);
 +    tcg_gen_mov_tl(r_temp2, src2);
 +    gen_trap_ifdivzero_tl(r_temp2);
 +    tcg_gen_brcondi_tl(TCG_COND_NE, r_temp1, INT64_MIN, l1);
 +    tcg_gen_brcondi_tl(TCG_COND_NE, r_temp2, -1, l1);
     tcg_gen_movi_i64(dst, INT64_MIN);
     tcg_gen_br(l2);
     gen_set_label(l1);
 -    tcg_gen_div_i64(dst, cpu_cc_src, cpu_cc_src2);
 +    tcg_gen_div_i64(dst, r_temp1, r_temp2);
     gen_set_label(l2);
 +    tcg_temp_free(r_temp1);
 +    tcg_temp_free(r_temp2);
  }
  #endif

 @@ -3173,10 +3178,17 @@ static void disas_sparc_insn(DisasContext * dc)
                         break;
  #ifdef TARGET_SPARC64
                     case 0xd: /* V9 udivx */
 -                        tcg_gen_mov_tl(cpu_cc_src, cpu_src1);
 -                        tcg_gen_mov_tl(cpu_cc_src2, cpu_src2);
 -                        gen_trap_ifdivzero_tl(cpu_cc_src2);
 -                        tcg_gen_divu_i64(cpu_dst, cpu_cc_src, cpu_cc_src2);
 +                        {
 +                            TCGv r_temp1, r_temp2;
 +                            r_temp1 = tcg_temp_local_new();
 +                            r_temp2 = tcg_temp_local_new();
 +                            tcg_gen_mov_tl(r_temp1, cpu_src1);
 +                            tcg_gen_mov_tl(r_temp2, cpu_src2);
 +                            gen_trap_ifdivzero_tl(r_temp2);
 +                            tcg_gen_divu_i64(cpu_dst, r_temp1, r_temp2);
 +                            tcg_temp_free(r_temp1);
 +                            tcg_temp_free(r_temp2);
 +                        }
                         break;
  #endif
                     case 0xe: /* udiv */
 --
 1.7.3.4





Re: [Qemu-devel] Actual TB code doesn't look like what was intended (TCG issue)?

2011-06-24 Thread Laurent Desnogues
On Fri, Jun 24, 2011 at 4:44 AM, Max Filippov jcmvb...@gmail.com wrote:
 Hello guys.

 I'm running qemu on x86_64 host.
 It's clean build from git sources dated 2011.05.19, commit 
 1fddfba129f5435c80eda14e8bc23fdb888c7187
 I have the following output from log trace,op,out_asm:

 Trace 0x4000a310 [d0026c92]
 OP:
   0xd0c0
  movi_i32 tmp1,$0xfff4
  add_i32 tmp0,ar9,tmp1
  qemu_ld32 ar1,tmp0,$0x0

   0xd0c3
  movi_i32 tmp1,$0xfff0
  add_i32 tmp0,ar9,tmp1
  qemu_ld32 ar0,tmp0,$0x0

 [...snip...]
[...]
 0x4000a360:  xor%esi,%esi
 0x4000a362:  callq  0x52edc2
[...]
 (gdb) x/25i 0x4000a330
[...]
   0x4000a360:  mov    $0x1,%esi
   0x4000a365:  callq  0x52edc2 __ldl_mmu
   0x4000a36a:  mov    %eax,%ebp
   0x4000a36c:  sub    $0x44,%al
 = 0x4000a36e:  lea    -0x10(%rbx),%esp
   0x4000a371:  mov    %ebp,0xc(%r14)
   0x4000a375:  mov    %r12d,%esi
   0x4000a378:  mov    %r12d,%edi

 Please note how the current instruction in gdb differ from what was said in 
 OUT. This lea corrupts stack pointer and the next callq generates segfault.
 Could please anyone familiar with TCG take a look at this, or suggest where I 
 should look myself?

As Peter hinted, you're not looking at the code you think :-)
Note how your original TCG code does loads:

   qemu_ld32 ar1,tmp0,$0x0

That $0x0 will end up in %RSI.  It's the mem index used to
distinguish from user and privileged level accesses.  In your
examples of host code, in one case it is 0 and in the other
it is 1, so you're definitely not really looking at the same
block in the same running conditions.

HTH,

Laurent



Re: [Qemu-devel] Actual TB code doesn't look like what was intended (TCG issue)?

2011-06-24 Thread Laurent Desnogues
On Fri, Jun 24, 2011 at 10:35 AM, Max Filippov jcmvb...@gmail.com wrote:
[...]
 Yes, I've noticed it (however, after I sent this mail).
 But (1) quoted OUT is the last OUT for this host address range in the log and 
 (2) in gdb I set b tlb_fill if retaddr == 0x4000a369 and made some steps.
 You mean that I should look at previous OUTs for this address range?

That should be the last block matching the address in the
log output if you run *under gdb* with -d out_asm.

BTW you say this is a clean build, but as far as I could see
it looks like your emulated target is not part of standard
QEMU;  it seems to have a register named 'ar9'.  Did I
miss something or is it some non public target? ia64?


Laurent



Re: [Qemu-devel] [RFC][PATCH v0 0/8] Improve register allocator

2011-05-24 Thread Laurent Desnogues
On Tue, May 24, 2011 at 1:31 PM, Kirill Batuzov batuz...@ispras.ru wrote:
[...]
 Gathered statistics shows some interesting things too. I've run matrix
 multiplication benchmark (guest - ARM, host - x86, linux-user mode, with
 my patches applied) and here are the results:

 spill count         3916
  real spills       32
  spills at bb end  1023
  spills at call:
    globals         2755
    iarg passing    0
    call cloobers   106

 Real spills are spills generated by register allocator when it runs out
 of registers.  They are less than 1% of all spills.  Other tests show
 similar behavior.

When you write host x86, do you mean IA32 or x86_64?
That might change the number of real spills a lot if you meant
x86_64.

 I think any further improvements to register allocator without leveling
 conventions about saving globals at calls and BB ends somehow is
 useless.

 Currently we are looking if we can pass some globals on registers
 through basic block boundaries (inside one TB of course).

If by basic block, you mean BB as implied by TCG br for
instance, I'm not sure all guests will benefit a lot.  If you
mean that you intend on putting several guests BB in a
single TB then I guess you'll have to first collect dynamic
statistics before dynamically switching to grouping BB.


Laurent



Re: [Qemu-devel] [PATCH 9/9] cpu-exec.c: avoid AREG0 use

2011-05-22 Thread Laurent Desnogues
On Sun, May 22, 2011 at 7:10 PM, Peter Maydell peter.mayd...@linaro.org wrote:
 On 22 May 2011 17:55, Blue Swirl blauwir...@gmail.com wrote:
 For ARM, the handcrafted instructions below need to be changed to save also 
 r7:
    /* stmdb sp!, { r4 - r6, r8 - r11, lr } */
    tcg_out32(s, (COND_AL  28) | 0x092d4f70);

    /* ldmia sp!, { r4 - r6, r8 - r11, pc } */
    tcg_out32(s, (COND_AL  28) | 0x08bd8f70);

 That would be ...ff0 rather than ...f70 in both cases
 (bottom 16 bits are a bit map of registers being saved/loaded):

    /* stmdb sp!, { r4 - r11, lr } */
    tcg_out32(s, (COND_AL  28) | 0x092d4ff0);

    /* ldmia sp!, { r4 - r11, pc } */
    tcg_out32(s, (COND_AL  28) | 0x08bd8ff0);

Shouldn't you extend the range to include r12, due to
the 8-byte alignment restriction of the stack?


Laurent



Re: [Qemu-devel] [PATCH 0/6] Implement constant folding and copy propagation in TCG

2011-05-21 Thread Laurent Desnogues
On Sat, May 21, 2011 at 1:31 AM, Andreas Färber andreas.faer...@web.de wrote:
[...]
 Has anyone evaluated reusing LLVM optimization passes for TCG? Or maybe
 GIMPL if there's an equivalent?

IMHO the qemu_ld/st semantics and the size of TB blocks
will always limit the usefulness of more involved
optimizations than what already exists in QEMU.

Basically qemu_ld/st ops prevent any code change across
them which make optimization opportunities rather low.
Considering they always take the fast path would surely
change the situation, but then the slow path would have to
be able to rebuild information;  certainly possible, but
quite involved too :-)

Once this qemu_ld/st change is done, you'll the need to
get to the problem of too short TB blocks;  a way to do
this is to generate traces of frequently executed blocks
and consider them as a single block[1];  that would open
the door to many more code generation improvements.

This certainly doesn't mean Kirill and Aurélien changes
are not welcome, just that I think, in the current state,
such changes will always have a limited impact.


Laurent

[1] See for instance section 2.3 of Derek Bruening's
PhD thesis about DynamoRIO
http://www.burningcutlery.com/derek/phd.html



Re: [Qemu-devel] [PATCH RFC 00/11] AREG0 elimination

2011-05-15 Thread Laurent Desnogues
On Sun, May 15, 2011 at 9:15 AM, Blue Swirl blauwir...@gmail.com wrote:
 On Sun, May 15, 2011 at 1:04 AM, Aurelien Jarno aurel...@aurel32.net wrote:
 On Sun, May 15, 2011 at 12:52:35AM +0300, Blue Swirl wrote:
 On Sun, May 15, 2011 at 12:16 AM, Aurelien Jarno aurel...@aurel32.net 
 wrote:
  On Sat, May 14, 2011 at 10:35:20PM +0300, Blue Swirl wrote:
[...]
  The env register is used very often (basically for every load/store, but
  also a lot of helpers), so it makes sense to reserve a register for it.
 
  For what I understand from your patch series, you prefer to pass this
  register explicitly to TCG functions. This basically means this TCG
  global will be loaded to host register as soon as it is used, but also
  regularly, as globals are saved back to their canonical location before
  an helper or a load/store.
 
  So it seems that this patch series will just allowing the env register
  to change over time, though it will not spare one more register for the
  TCG code, and it will emit longer TCG code to regularly reload the env
  global into a host register.

 But there will be one more register available in some cases. In other

 Inside the TCG code, it will basically happens very rarely, given
 load/store are really the most used instructions, and they need to load
 the env register.

 Not exactly, from a sample run with -d op_opt:
 $ egrep -v -e '^$' -v -e 'OP after' -v -e ' end' -v -e 'Search PC'
 /tmp/qemu.log | awk '{print $1}' | sort | uniq -c|sort -rn
 1673966 movi_i32
  653931 ld_i32
  607432 mov_i32
  428684 st_i32
  326878 movi_i64
  308626 add_i32
  283186 call
  256817 exit_tb
  207232 nopn
  189388 goto_tb
  122398 and_i32
  117997 shr_i32
  89107 qemu_ld32
  82926 set_label
  82713 brcond_i32
  67169 qemu_st32
  55109 or_i32
  46536 ext32u_i64
  44288 xor_i32
  38103 sub_i32
  26361 shl_i32
  23218 shl_i64
  23218 qemu_st64
  23218 or_i64
  20474 shr_i64
  20445 qemu_ld64
  11161 qemu_ld8u
  10409 qemu_st8
   5013 qemu_ld16u
   3795 qemu_st16
   2776 qemu_ld8s
   1915 sar_i32
   1414 qemu_ld16s
    839 not_i32
    579 setcond_i32
    213 br
     42 ext32s_i64
     30 mul_i64

Unless I missed something, this doesn't show the usage of
ld/st per TB, which is what Aurélien was looking for if I
understood correctly.  All I can say is that you had at
most 256817 TB's and 234507 qemu_ld/st, so about one per
TB.

Anyway I must be thick, because I fail to see how
generated code could access guest CPU registers without a
pointer to the CPU env :-)

IIUC the SPARC translator uses ld_i32/st_i32 mainly for
accessing the guest CPU registers, which due to register
windows is held in a dedicated global temp.  Is that
correct?  If so this is kind of hiding accesses to the
CPU env;  all other targets read/write registers by using
CPU env (through the use global temps in most cases).

So I think most (if not almost all) TB will need a pointer
to CPU env, which is why I think Aurélien's proposal to
keep a dedicated register that'd be loaded in the prologue
is the only way to not degrade performance of the
generated code (I'd add that this dedicated register
should be the one defined by the ABI as holding the first
parameter value, if that's possible;  I'm afraid this is
not necessarily a good idea).


Laurent



Re: [Qemu-devel] [PATCH RFC 00/11] AREG0 elimination

2011-05-15 Thread Laurent Desnogues
On Sun, May 15, 2011 at 1:33 PM, Blue Swirl blauwir...@gmail.com wrote:
[...]
 x86_64 uses r14 as TCG_AREG0. Despite the instructions being quite
 simple (only 2 movi_i32), the resulting code makes 2 access to env to
 save the two registers. Having to reload the env pointer each time to a
 register would clearly increase the size of this TB.

 I don't think TCG would be that simple, instead the pointer would be
 loaded only once in this case.

Assuming TCG was able to allocate a register for that,
it would be live at most for one TB, so you'd have to
load it at least once per TB, and with block chaining
that wouldn't be efficient as you'd keep on reloading it.


Laurent



Re: [Qemu-devel] tcg/tcg.c:1892: tcg fatal error

2011-04-21 Thread Laurent Desnogues
On Thu, Apr 21, 2011 at 4:57 PM, Artyom Tarasenko atar4q...@gmail.com wrote:
 On Tue, Apr 12, 2011 at 4:14 AM, Igor Kovalenko
 igor.v.kovale...@gmail.com wrote:
 Do you have public test case?
 It is possible to code this delay slot write test but real issue may
 be corruption elsewhere.

 The test case is trivial: it's just the two instructions, branch and wrpr.

 In theory there could be multiple issues including compiler induced ones.
 I'd prefer to see some kind of reproducible testcase.

 Ok, attached a 40 byte long test (the first 32 bytes are not used and
 needed only because the bios entry point is 0x20).

 $ git pull  make  sparc64-softmmu/qemu-system-sparc64 -bios
 test-wrpr.bin -nographic
 Already up-to-date.
 make[1]: Nothing to be done for `all'.
 /mnt/terra/projects/vanilla/qemu/tcg/tcg.c:1892: tcg fatal error
 Aborted

The problem seems to be that wrpr is using a non-local
TCG tmp (cpu_tmp0).


Laurent



Re: [Qemu-devel] tcg/tcg.c:1892: tcg fatal error

2011-04-21 Thread Laurent Desnogues
On Thu, Apr 21, 2011 at 9:45 PM, Igor Kovalenko
igor.v.kovale...@gmail.com wrote:
 On Thu, Apr 21, 2011 at 7:44 PM, Laurent Desnogues
 laurent.desnog...@gmail.com wrote:
 On Thu, Apr 21, 2011 at 4:57 PM, Artyom Tarasenko atar4q...@gmail.com 
 wrote:
 On Tue, Apr 12, 2011 at 4:14 AM, Igor Kovalenko
 igor.v.kovale...@gmail.com wrote:
 Do you have public test case?
 It is possible to code this delay slot write test but real issue may
 be corruption elsewhere.

 The test case is trivial: it's just the two instructions, branch and wrpr.

 In theory there could be multiple issues including compiler induced ones.
 I'd prefer to see some kind of reproducible testcase.

 Ok, attached a 40 byte long test (the first 32 bytes are not used and
 needed only because the bios entry point is 0x20).

 $ git pull  make  sparc64-softmmu/qemu-system-sparc64 -bios
 test-wrpr.bin -nographic
 Already up-to-date.
 make[1]: Nothing to be done for `all'.
 /mnt/terra/projects/vanilla/qemu/tcg/tcg.c:1892: tcg fatal error
 Aborted

 The problem seems to be that wrpr is using a non-local
 TCG tmp (cpu_tmp0).

 Just tried the test case with write to %pil - seems like write itself is OK.
 The issue appears to be with save_state() call since adding save_state
 to %pil case provokes the same tcg abort.

The problem is that cpu_tmp0, not being a local tmp, doesn't
need to be saved across helper calls.  This results in the
TCG optimizer getting rid of it even though it's later used.
Look at the log and you'll see what I mean :-)


Laurent



[Qemu-devel] Re: QEMU regression problems - Update FPU

2011-02-24 Thread Laurent Desnogues
On Thu, Feb 24, 2011 at 12:10 PM, Paolo Bonzini pbonz...@redhat.com wrote:
 On 02/23/2011 08:04 PM, Aurelien Jarno wrote:

 Actually that's the reason why i386 doesn't use softfloat, as all the
 trigonometric use libm, and the bridge between softfloat and libm is not
 working correctly (plenty of type abuse).

 Besides, I doubt softfloat would want bug-compatible trig function
 implementations.  fsincos for example is a far cry from the precision of the
 libm function.

I agree, these functions certainly don't belong to SoftFloat.
They should be implemented in target-i386/op_helper.c.  But
bit accuracy might be difficult to achieve, unless Intel/AMD
properly documented how they compute all these functions.


Laurent



Re: [Qemu-devel] Re: QEMU regression problems - Update FPU

2011-02-23 Thread Laurent Desnogues
On Wed, Feb 23, 2011 at 9:16 AM, Peter Maydell peter.mayd...@linaro.org wrote:
 On 18 February 2011 07:12, Gerhard Wiesinger li...@wiesinger.com wrote:
 Issue 1.) with FPU still present
 I tracked down the problematic code and it is a rounding error from double
 precision to 64bit floats: Any ideas how to fix such an issue in general?

 QEMU result in ST0: 0.42925860786976457 (wrong emulated)
 KVM result in ST0:  0.42925860786975449 (correct)

 This is an error when running QEMU in TCG mode, right?
 At the moment x86 is the odd-one-out in that it doesn't
 use CONFIG_SOFTFLOAT for its FPU emulation, so somebody
 has made an explicit choice of preferring speed over
 accuracy, and I am unsurprised that there are rounding
 errors as a result.

Even if you were using SoftFloat, you'd probably still get wrong
results given that this test seems to test trigonometric instructions
which aren't implemented in SoftFloat, but are relying on libm.


Laurent



Re: [Qemu-devel] QEMU license problem (was [PATCH v3] Drop --whole-archive and static libraries)

2010-03-07 Thread Laurent Desnogues
On Sun, Mar 7, 2010 at 2:53 PM, Anthony Liguori anth...@codemonkey.ws wrote:
[...]

 Removing libqemu.a was technically ok, but throws a license problem:


 Every file contains a copyright/license.  The statement in the top level is
 just a statement of intention.  It's to make sure that people do not
 introduce GPL'd code into libqemu.a.

./configure --target-list=i386-linux-user,i386-softmmu
make -j7

And:

find . -name 'lib*a'

finds nothing, so how can you say what is supposed to be
in libqemu?


Laurent




Re: [Qemu-devel] forking i386 binaries on arm linux user mode

2010-02-11 Thread Laurent Desnogues
On Wed, Feb 10, 2010 at 10:38 PM, Damion Yates dam...@google.com wrote:
[...]
 Should clone()/fork() work?  Has anyone been able to run wine ./blah.exe
 under user-linux mode of qemu on arm or indeed any other non x86 based
 CPU ?

I forgot to mention NPTL is not supported for x86 which
will be an issue.


Laurent




Re: [Qemu-devel] forking i386 binaries on arm linux user mode

2010-02-10 Thread Laurent Desnogues
On Wed, Feb 10, 2010 at 10:38 PM, Damion Yates dam...@google.com wrote:
 I've grabbed the latest stable qemu and compiled under scratchbox.  I
 hit an issue compiling it, with no __builtin__clear_cache() so linked in
 a kludge.c containing a call to __clear_cache() with the params passed
 as they would be to __builtin__clear_cache().

 Firstly does this sound like it should work as a workaround?

That's how I did it (except that I replaced the calls to
__builtin__clear_cache with calls to __clear_cache in
tcg/arm/tcg-target.h and exec-all.h.

 It certainly got me to the next level, which is that I can now run loads
 of linux binaries on my armlinux system (a Nokia n900). I've tried tower
 toppler (http://toppler.sourceforge.net/) which uses SDL (via X11) and
 this was surprisingly fast, in fact it almost felt as fast as the native
 toppler that somebody crosscompiled already. Most linux utils work when
 I copy then and any dependant libs from my x86 laptop to the phone. I'm
 lucky (I guess) that /lib/ld-linux.so.3 is the arm version and I'm using
 a slightly older .2 for x86 so I can have both files there. I also
 enabled arbitrary execution of binaries via binfmt_misc. The 600 Mhz Arm
 V8 Cortex (I think it is), feels like it's running at about Pentium 90
 speeds, which I'm hoping is enough for what I really want to get going.

The N900 has an ARMv7 Cortex-A8 (OMAP3).

P90 is about the speed I'd expect.  Note it will seem
much slower if your program makes heavy use of floating
point.

 I want to run an old, possibly win16 Windows game under wine. I saw that
 user mode qemu-i386 was able to run wine in a post in 2004:
 http://lists.terrasoftsolutions.com/pipermail/yellowdog-general/2004-June/014468.html
  - This was on a PPC however.

 When I run wine it SEGVs out and the strace of it shows it dies trying
 to do clone(). I also can't run things like xterm which can't do fork().
 Is this because by default it's trying to go via the arm /bin/sh to
 invoke whatever it wants to exec() in to?

 Should clone()/fork() work?  Has anyone been able to run wine ./blah.exe
 under user-linux mode of qemu on arm or indeed any other non x86 based
 CPU ?

I have seen a complex Linux i386 program run on an
ARM platform (with multithreading and Qt;  note it
crashed at some point but it was due to the
application doing access to uninitialized memory).
IIRC it was using chroot from inside QEMU (in main)
to make sure that all calls to exec would happen on
an x86 file system.  You might give that a try.


Laurent




Re: [Qemu-devel] [PATCH 4/4] target-arm: neon fix

2010-02-08 Thread Laurent Desnogues
On Mon, Feb 8, 2010 at 12:47 PM, Riku Voipio riku.voi...@iki.fi wrote:
 On Sun, Feb 07, 2010 at 02:02:31PM +0100, Laurent Desnogues wrote:
 On Sun, Feb 7, 2010 at 1:54 PM, Laurent Desnogues
 laurent.desnog...@gmail.com wrote:
  On Fri, Feb 5, 2010 at 4:52 PM, Riku Voipio riku.voi...@iki.fi wrote:
  From: Juha Riihimäki juha.riihim...@nokia.com

  add an extra check in two registers and a shift to ensure element
  size decoding logic cannot fail.

  I think there's a patch ordering problem that makes
  the comment and the change not agree :-)

 Sorry, apparently messed up while rebasing.

 BTW I don't think adding the check for size is needed
 here.  The encoding at that point looks like this:

  332211
  10987654321098765432109876543210
  001_1___1__1
  001_1__1___1
  001_1_11

 so it will stop for size == 0 given that bit 19 will have to
 be set.

 Juha agrees so we'll drop this patch (or more precisely get the actual change
 out of the previous patch..)

Do you intend to resend the patch 3 of this set or should it
be reviewed as is?

Thanks,

Laurent




Re: [Qemu-devel] [PATCH] User mode: Handle x86_64 vsyscall

2010-02-07 Thread Laurent Desnogues
On Sun, Feb 7, 2010 at 1:22 AM, Jamie Lokier ja...@shareable.org wrote:
[...]

 How would you achieve that?  Your guest OS
 doesn't necessarily have the code mapped.  I
 think this has to be considered as other syscalls,
 though slightly different.

 There is no guest OS when doing -user emulation.
 Only qemu.

I meant that the vsyscall page doesn't exist on
other guest systems but x86_64 running Linux.
So if one wants to have it somehow mapped
then it would have to be installed by QEMU,
and QEMU can't install such a page due to
limitations in the way it handles virtual
addresses.

  My favorite solution would be a vsyscall page mapped
  to the correct fixed address and filled with QEMU
  generated specific code, for example code which calls the
  normal syscalls to do the work. This would only
  need modifications for linux-user code.

 You mean you'd explicitly put somewhere x86_64
 code that simulates the behaviour of vsyscall?

 That seems like a good idea to me.

Why not indeed.  But someone will first have
to fix virtual memory management.


Laurent




Re: [Qemu-devel] [PATCH 1/4] target-arm: neon - fix VRADDHN/VRSUBHN vs VADDHN/VSUBHN

2010-02-07 Thread Laurent Desnogues
On Fri, Feb 5, 2010 at 4:52 PM, Riku Voipio riku.voi...@iki.fi wrote:
 From: Riku Voipio riku.voi...@nokia.com

 The rounding/truncating options were inverted. truncating
 was done when rounding was meant and vice verse.

 Signed-off-by: Riku Voipio riku.voi...@nokia.com

Acked-by: Laurent Desnogues laurent.desnog...@gmail.com


Laurent

 ---
  target-arm/translate.c |    4 ++--
  1 files changed, 2 insertions(+), 2 deletions(-)

 diff --git a/target-arm/translate.c b/target-arm/translate.c
 index 5cf3e06..4bd813a 100644
 --- a/target-arm/translate.c
 +++ b/target-arm/translate.c
 @@ -4957,7 +4957,7 @@ static int disas_neon_data_insn(CPUState * env, 
 DisasContext *s, uint32_t insn)
                     case 0: case 1: case 4: /* VADDL, VADDW, VADDHN, VRADDHN 
 */
                         gen_neon_addl(size);
                         break;
 -                    case 2: case 3: case 6: /* VSUBL, VSUBW, VSUBHL, VRSUBHL 
 */
 +                    case 2: case 3: case 6: /* VSUBL, VSUBW, VSUBHN, VRSUBHN 
 */
                         gen_neon_subl(size);
                         break;
                     case 5: case 7: /* VABAL, VABDL */
 @@ -5026,7 +5026,7 @@ static int disas_neon_data_insn(CPUState * env, 
 DisasContext *s, uint32_t insn)
                     } else if (op == 4 || op == 6) {
                         /* Narrowing operation.  */
                         tmp = new_tmp();
 -                        if (u) {
 +                        if (!u) {
                             switch (size) {
                             case 0:
                                 gen_helper_neon_narrow_high_u8(tmp, cpu_V0);
 --
 1.6.5








Re: [Qemu-devel] [PATCH 2/4] target-arm: neon vshll instruction fix

2010-02-07 Thread Laurent Desnogues
On Fri, Feb 5, 2010 at 4:52 PM, Riku Voipio riku.voi...@iki.fi wrote:
 From: Juha Riihimäki juha.riihim...@nokia.com

 implementation only widened the 32bit source vector elements into a
 64bit destination vector but forgot to perform the actual shifting
 operation.

 Signed-off-by: Juha Riihimäki juha.riihim...@nokia.com
 Signed-off-by: Riku Voipio riku.voi...@nokia.com

Acked-by: Laurent Desnogues laurent.desnog...@gmail.com


Laurent

 ---
  target-arm/translate.c |    1 +
  1 files changed, 1 insertions(+), 0 deletions(-)

 diff --git a/target-arm/translate.c b/target-arm/translate.c
 index 4bd813a..537d9d6 100644
 --- a/target-arm/translate.c
 +++ b/target-arm/translate.c
 @@ -5385,6 +5385,7 @@ static int disas_neon_data_insn(CPUState * env, 
 DisasContext *s, uint32_t insn)
                         if (pass == 1)
                             tmp = tmp2;
                         gen_neon_widen(cpu_V0, tmp, size, 1);
 +                        tcg_gen_shli_i64(cpu_V0, cpu_V0, 8  size);
                         neon_store_reg64(cpu_V0, rd + pass);
                     }
                     break;
 --
 1.6.5








Re: [Qemu-devel] [PATCH 4/4] target-arm: neon fix

2010-02-07 Thread Laurent Desnogues
On Fri, Feb 5, 2010 at 4:52 PM, Riku Voipio riku.voi...@iki.fi wrote:
 From: Juha Riihimäki juha.riihim...@nokia.com

 add an extra check in two registers and a shift to ensure element
 size decoding logic cannot fail.

 Signed-off-by: Juha Riihimäki juha.riihim...@nokia.com
 Signed-off-by: Riku Voipio riku.voi...@nokia.com
 ---
  target-arm/translate.c |    3 ++-
  1 files changed, 2 insertions(+), 1 deletions(-)

 diff --git a/target-arm/translate.c b/target-arm/translate.c
 index 743b846..8bba034 100644
 --- a/target-arm/translate.c
 +++ b/target-arm/translate.c
 @@ -4567,8 +4567,9 @@ static int disas_neon_data_insn(CPUState * env, 
 DisasContext *s, uint32_t insn)
                 size = 3;
             } else {
                 size = 2;
 -                while (size  (insn  (1  (size + 19))) == 0)
 +                while (size  (insn  (1  (size + 19))) == 0) {
                     size--;
 +                }
             }
             shift = (insn  16)  ((1  (3 + size)) - 1);
             /* To avoid excessive dumplication of ops we implement shift

I think there's a patch ordering problem that makes
the comment and the change not agree :-)


Laurent




Re: [Qemu-devel] [PATCH 4/4] target-arm: neon fix

2010-02-07 Thread Laurent Desnogues
On Sun, Feb 7, 2010 at 1:54 PM, Laurent Desnogues
laurent.desnog...@gmail.com wrote:
 On Fri, Feb 5, 2010 at 4:52 PM, Riku Voipio riku.voi...@iki.fi wrote:
 From: Juha Riihimäki juha.riihim...@nokia.com

 add an extra check in two registers and a shift to ensure element
 size decoding logic cannot fail.

 Signed-off-by: Juha Riihimäki juha.riihim...@nokia.com
 Signed-off-by: Riku Voipio riku.voi...@nokia.com
 ---
  target-arm/translate.c |    3 ++-
  1 files changed, 2 insertions(+), 1 deletions(-)

 diff --git a/target-arm/translate.c b/target-arm/translate.c
 index 743b846..8bba034 100644
 --- a/target-arm/translate.c
 +++ b/target-arm/translate.c
 @@ -4567,8 +4567,9 @@ static int disas_neon_data_insn(CPUState * env, 
 DisasContext *s, uint32_t insn)
                 size = 3;
             } else {
                 size = 2;
 -                while (size  (insn  (1  (size + 19))) == 0)
 +                while (size  (insn  (1  (size + 19))) == 0) {
                     size--;
 +                }
             }
             shift = (insn  16)  ((1  (3 + size)) - 1);
             /* To avoid excessive dumplication of ops we implement shift

 I think there's a patch ordering problem that makes
 the comment and the change not agree :-)

BTW I don't think adding the check for size is needed
here.  The encoding at that point looks like this:

 332211
 10987654321098765432109876543210
 001_1___1__1
 001_1__1___1
 001_1_11

so it will stop for size == 0 given that bit 19 will have to
be set.


Laurent




Re: [Qemu-devel] [PATCH] User mode: Handle x86_64 vsyscall

2010-02-06 Thread Laurent Desnogues
On Sat, Feb 6, 2010 at 8:49 AM, Stefan Weil w...@mail.berlios.de wrote:
[...]
 I tested two different hosts with x86_64-linux-user:

 * 32 bit Intel (i386) - does not work with your patch

For me x86_64 on i386 has always failed without
even calling vsyscall :-)

 * 64 bit AMD (x86_64)  - works with your patch

 Your patch improves the emulation for 64 bit hosts.
 Nevertheless, it has some open points:

 * target-i386 code should not have to know about
  linux vsyscall

Given that we have to workaround 64-bit virtual
address limitations (cf. Richard mail and previous
discussions on the list), doing otherwise looks
difficult.

 * there is no vsyscall page in memory,
  but very special programs might expect to see one
  (it is even worse: the target sees the memory page
  of the host)

 * it is not possible to step into vsyscall code
  using a debugger

How would you achieve that?  Your guest OS
doesn't necessarily have the code mapped.  I
think this has to be considered as other syscalls,
though slightly different.

 My favorite solution would be a vsyscall page mapped
 to the correct fixed address and filled with QEMU
 generated specific code, for example code which calls the
 normal syscalls to do the work. This would only
 need modifications for linux-user code.

You mean you'd explicitly put somewhere x86_64
code that simulates the behaviour of vsyscall?


Laurent




Re: [Qemu-devel] [PATCH] User mode: Handle x86_64 vsyscall

2010-02-05 Thread Laurent Desnogues
On Fri, Feb 5, 2010 at 11:57 PM, Stefan Weil w...@mail.berlios.de wrote:
 Laurent Desnogues schrieb:
[...]

 I'm still struggling with bntest and other x86_64-linux-user software
 calling any of the vsyscall functions.

 Laurent, your vsyscall patch only works on x86_64 hosts.

 A lot of software calls time() which uses vsyscall on x86_64 which
 does not work with x86_64-linux-user mode.

I'm not sure I understand what you mean.  Did you try
on some other host and it failed?  Was your host
32-bit?  If so, I'm afraid user-mode will fail for more
reasons than vsyscall.

 So the status of x86_64-linux-user is not more than experimental :-(

 I tried to modify x86_64-linux-user to set up a vsyscall page in high
 memory,
 but this seems to be difficult (at least with 32 bit host).

 Any hints how this should be done are welcome.

My patch explicitly prevents the linking of the vsyscall
page.

Could you provide more info about your host?


Laurent




Re: [Qemu-devel] Adding support for MIPS64 as host

2010-02-04 Thread Laurent Desnogues
On Thu, Feb 4, 2010 at 6:26 PM, Utkarsh Sopan utkarsh.so...@coe.dce.edu wrote:

 Can you tell me what is the status of adding MIPS64 support?
 as at the wiki page it showed Red earlier.

Look here: http://git.aurel32.net/?p=qemu.git;a=summary


Laurent




Re: [Qemu-devel] Need help to run application on QEMU

2010-02-03 Thread Laurent Desnogues
On Wed, Feb 3, 2010 at 11:10 AM, Taimoor Mirza mooni_mi...@hotmail.com wrote:
 Hi all,

 I have been trying to use VGA card and LCD for integrator 926. I've built my
 kernel PLUS with graphics support. I used following command to run my PLUS
 application on qemu:

 qemu-system-arm -M integratorcp -cpu arm926 -kernel ./graphics_demo.out

 I am getting following error:


 qemu: fatal: integratorcm_read: Unimplemented offset 0x54

 R00=1000 R01= R02= R03=1000
 R04=0014 R05=0400 R06=0038e054 R07=
 R08= R09=003b06ec R10= R11=
 R12=00390814 R13=003b06cc R14=00086148 R15=00085eec
 PSR=801f N--- A sys32
 s00=(   0) s01=(   0) d00=(   0)
 s02=(   0) s03=(   0) d01=(   0)
 s04=(   0) s05=(   0) d02=(   0)
 s06=(   0) s07=(   0) d03=(   0)
 s08=(   0) s09=(   0) d04=(   0)
 s10=(   0) s11=(   0) d05=(   0)
 s12=(   0) s13=(   0) d06=(   0)
 s14=(   0) s15=(   0) d07=(   0)
 s16=(   0) s17=(   0) d08=(   0)
 s18=(   0) s19=(   0) d09=(   0)
 s20=(   0) s21=(   0) d10=(   0)
 s22=(   0) s23=(   0) d11=(   0)
 s24=(   0) s25=(   0) d12=(   0)
 s26=(   0) s27=(   0) d13=(   0)
 s28=(   0) s29=(   0) d14=(   0)
 s30=(   0) s31=(   0) d15=(   0)
 FPSCR: 

 This application has requested the Runtime to terminate it in an unusual
 way.
 Please contact the application's support team for more information.

 Can anyone help me?

Your software looks faulty the core module register at offset 0x54
is write only, cf.
http://infocenter.arm.com/help/topic/com.arm.doc.dui0138e/DUI0138E_CMxx6_UserGuide.pdf
Table 4-3 page 4-14.


Laurent




Re: [Qemu-devel] Adding support for MIPS64 as host

2010-02-03 Thread Laurent Desnogues
On Wed, Feb 3, 2010 at 5:53 PM, Utkarsh Sopan utkarsh.so...@coe.dce.edu wrote:
 I am trying to add support for MIPS 64 as Host machine as my academic
 project.

Note that some work has already been done for MIPS host
support.

 I am new to QEMU.

 Problem is I dont have a MIPS 64 machine to test on.
 Please suggest whether or not I can use a nested QEMU emulator to test the
 same.
 i.e. running my version on top of a simulated MIPS64 machine.

 Is it possible to do so?

I was able to run on an x86_64 host, an ARM-hosted QEMU simulating
x86.  That was for Linux user mode.  I can't say if that would work for
other targets or for system simulation.


Laurent




Re: [Qemu-devel] [BUG] qemu-x86_64 crash when running bntest (was: [PATCH] Porting TCG to alpha platform)

2010-01-30 Thread Laurent Desnogues
On Sat, Jan 30, 2010 at 10:30 AM, Stefan Weil w...@mail.berlios.de wrote:
 Laurent Desnogues schrieb:
 On Sat, Jan 30, 2010 at 12:04 AM, Stefan Weil w...@mail.berlios.de
 wrote:
 [...]
 that was a good suggestion. bntest raises a segmentation fault
 (NULL pointer, p == 0, see below) with qemu-x86_64 on a x86_64 host.

 Compile bntest statically and it should work.

 x86_64 user mode is completely broken for dynamically
 linked programs.


 Laurent


 A statically linked bntest results in a crash, too.

 Stefan


 $ ldd bntest
        not a dynamic executable
 $ x86/x86_64-linux-user/qemu-x86_64 ./bntest
 ERROR: ioctl(SNDCTL_DSP_MAPINBUF): target=0x80085013 host=0x80105013
 ERROR: ioctl(SNDCTL_DSP_MAPOUTBUF): target=0x80085014 host=0x80105014
 obase=16
 ibase=16
 test BN_add
 print test BN_add\n
 qemu: uncaught target signal 11 (Segmentation fault) - core dumped

That worked for me. Could show us the last translated TB?


Laurent




Re: [Qemu-devel] [BUG] qemu-x86_64 crash when running bntest (was: [PATCH] Porting TCG to alpha platform)

2010-01-30 Thread Laurent Desnogues
On Sat, Jan 30, 2010 at 10:30 AM, Stefan Weil w...@mail.berlios.de wrote:
 Laurent Desnogues schrieb:
 On Sat, Jan 30, 2010 at 12:04 AM, Stefan Weil w...@mail.berlios.de
 wrote:
 [...]
 that was a good suggestion. bntest raises a segmentation fault
 (NULL pointer, p == 0, see below) with qemu-x86_64 on a x86_64 host.

 Compile bntest statically and it should work.

 x86_64 user mode is completely broken for dynamically
 linked programs.


 Laurent


 A statically linked bntest results in a crash, too.

OK, I found the issue.  It's vsyscall.  You should use the patch
I sent to this list a few months ago.


Laurent




Re: [Qemu-devel] [PATCH] Porting TCG to alpha platform

2010-01-29 Thread Laurent Desnogues
On Sat, Jan 30, 2010 at 12:04 AM, Stefan Weil w...@mail.berlios.de wrote:
[...]

 that was a good suggestion. bntest raises a segmentation fault
 (NULL pointer, p == 0, see below) with qemu-x86_64 on a x86_64 host.

Compile bntest statically and it should work.

x86_64 user mode is completely broken for dynamically
linked programs.


Laurent




Re: [Qemu-devel] Merge qemu android

2010-01-28 Thread Laurent Desnogues
On Thu, Jan 28, 2010 at 11:44 AM, Bastien ROUCARIES
roucaries.bast...@gmail.com wrote:
 Forget to cc

 On Sat, Jan 23, 2010 at 11:40 PM, Anthony Liguori anth...@codemonkey.ws
 wrote:

 On 01/21/2010 10:27 AM, Bastien ROUCARIES wrote:

 Hi,

 What is the step in order to get qemu android merged mainline ?

 http://android.git.kernel.org/?p=platform/external/qemu.git;a=summary
[...]

 I know but the code base is really different they use 0.8.2 as a base, and i
 was thinking of porting to upstream.

Really?  The tree you point to above has TCG.  Or is it unused?

 They use also craps like sdl :S

Mainline QEMU does too.


Laurent




Re: [Qemu-devel] [PATCH 4/4] target-arm: refactor cp15.c13 register access

2010-01-27 Thread Laurent Desnogues
On Wed, Jan 27, 2010 at 1:49 PM, Riku Voipio riku.voi...@iki.fi wrote:
 From: Riku Voipio riku.voi...@nokia.com

 Access the cp15.c13 TLS registers directly with TCG ops instead of with
 a slow helper. If the the cp15 read/write was not TLS register access,
 fall back to the cp15 helper.

 This makes accessing __thread variables in linux-user when apps are compiled
 with -mtp=cp15 possible. legal cp15 register to acces from linux-user are
 already checked in cp15_user_ok.

 While at it, make the cp15.c13 Thread ID registers available only on
 ARMv6K and newer.

 Signed-off-by: Riku Voipio riku.voi...@nokia.com

Acked-by: Laurent Desnogues laurent.desnog...@gmail.com


Laurent

 ---
  target-arm/helper.c    |   16 --
  target-arm/translate.c |   55 
 
  2 files changed, 55 insertions(+), 16 deletions(-)

 diff --git a/target-arm/helper.c b/target-arm/helper.c
 index b3aec99..27001e8 100644
 --- a/target-arm/helper.c
 +++ b/target-arm/helper.c
 @@ -511,7 +511,6 @@ void HELPER(set_cp15)(CPUState *env, uint32_t insn, 
 uint32_t val)
  uint32_t HELPER(get_cp15)(CPUState *env, uint32_t insn)
  {
     cpu_abort(env, cp15 insn %08x\n, insn);
 -    return 0;
  }

  /* These should probably raise undefined insn exceptions.  */
 @@ -1491,15 +1490,6 @@ void HELPER(set_cp15)(CPUState *env, uint32_t insn, 
 uint32_t val)
               tlb_flush(env, 0);
             env-cp15.c13_context = val;
             break;
 -        case 2:
 -            env-cp15.c13_tls1 = val;
 -            break;
 -        case 3:
 -            env-cp15.c13_tls2 = val;
 -            break;
 -        case 4:
 -            env-cp15.c13_tls3 = val;
 -            break;
         default:
             goto bad_reg;
         }
 @@ -1779,12 +1769,6 @@ uint32_t HELPER(get_cp15)(CPUState *env, uint32_t insn)
             return env-cp15.c13_fcse;
         case 1:
             return env-cp15.c13_context;
 -        case 2:
 -            return env-cp15.c13_tls1;
 -        case 3:
 -            return env-cp15.c13_tls2;
 -        case 4:
 -            return env-cp15.c13_tls3;
         default:
             goto bad_reg;
         }
 diff --git a/target-arm/translate.c b/target-arm/translate.c
 index 5cf3e06..786c329 100644
 --- a/target-arm/translate.c
 +++ b/target-arm/translate.c
 @@ -2455,6 +2455,57 @@ static int cp15_user_ok(uint32_t insn)
     return 0;
  }

 +static int cp15_tls_load_store(CPUState *env, DisasContext *s, uint32_t 
 insn, uint32_t rd)
 +{
 +    TCGv tmp;
 +    int cpn = (insn  16)  0xf;
 +    int cpm = insn  0xf;
 +    int op = ((insn  5)  7) | ((insn  18)  0x38);
 +
 +    if (!arm_feature(env, ARM_FEATURE_V6K))
 +        return 0;
 +
 +    if (!(cpn == 13  cpm == 0))
 +        return 0;
 +
 +    if (insn  ARM_CP_RW_BIT) {
 +        tmp = new_tmp();
 +        switch (op) {
 +        case 2:
 +            tcg_gen_ld_i32(tmp, cpu_env, offsetof(CPUARMState, 
 cp15.c13_tls1));
 +            break;
 +        case 3:
 +            tcg_gen_ld_i32(tmp, cpu_env, offsetof(CPUARMState, 
 cp15.c13_tls2));
 +            break;
 +        case 4:
 +            tcg_gen_ld_i32(tmp, cpu_env, offsetof(CPUARMState, 
 cp15.c13_tls3));
 +            break;
 +        default:
 +            dead_tmp(tmp);
 +            return 0;
 +        }
 +        store_reg(s, rd, tmp);
 +
 +    } else {
 +        tmp = load_reg(s, rd);
 +        switch (op) {
 +        case 2:
 +            tcg_gen_st_i32(tmp, cpu_env, offsetof(CPUARMState, 
 cp15.c13_tls1));
 +            break;
 +        case 3:
 +            tcg_gen_st_i32(tmp, cpu_env, offsetof(CPUARMState, 
 cp15.c13_tls2));
 +            break;
 +        case 4:
 +            tcg_gen_st_i32(tmp, cpu_env, offsetof(CPUARMState, 
 cp15.c13_tls3));
 +            break;
 +        default:
 +            return 0;
 +        }
 +        dead_tmp(tmp);
 +    }
 +    return 1;
 +}
 +
  /* Disassemble system coprocessor (cp15) instruction.  Return nonzero if
    instruction is not defined.  */
  static int disas_cp15_insn(CPUState *env, DisasContext *s, uint32_t insn)
 @@ -2489,6 +2540,10 @@ static int disas_cp15_insn(CPUState *env, DisasContext 
 *s, uint32_t insn)
         return 0;
     }
     rd = (insn  12)  0xf;
 +
 +    if (cp15_tls_load_store(env, s, insn, rd))
 +        return 0;
 +
     tmp2 = tcg_const_i32(insn);
     if (insn  ARM_CP_RW_BIT) {
         tmp = new_tmp();
 --
 1.6.5








Re: [Qemu-devel] [PATCH 4/5] linux-user: Add access to TLS registers

2010-01-26 Thread Laurent Desnogues
On Tue, Jan 26, 2010 at 5:00 PM, Riku Voipio riku.voi...@iki.fi wrote:
 From: Riku Voipio riku.voi...@nokia.com

 If you compile applications with gcc -mtp=cp15, __thread
 access's will generate an abort. Implement accessing allowed
 cp15.c13 registers on ARMv6K+ in linux-user.

 Signed-off-by: Riku Voipio riku.voi...@nokia.com
 ---
  target-arm/helper.c |   27 ++-
  1 files changed, 26 insertions(+), 1 deletions(-)

 diff --git a/target-arm/helper.c b/target-arm/helper.c
 index b3aec99..68578ce 100644
 --- a/target-arm/helper.c
 +++ b/target-arm/helper.c
 @@ -505,13 +505,38 @@ uint32_t HELPER(get_cp)(CPUState *env, uint32_t insn)

  void HELPER(set_cp15)(CPUState *env, uint32_t insn, uint32_t val)
  {
 +    int op2;
 +
 +    op2 = (insn  5)  7;
 +    /* Allow write access to CP15 User RW Thread ID Register */
 +    if (arm_feature (env, ARM_FEATURE_V6K)  ((insn  16)  0xf) == 13) {
 +        switch (op2) {
 +        case 2:
 +            env-cp15.c13_tls1 = val;
 +            return;
 +        }
 +    }
     cpu_abort(env, cp15 insn %08x\n, insn);
  }

  uint32_t HELPER(get_cp15)(CPUState *env, uint32_t insn)
  {
 +    int op2;
 +    /* Allow read access to CP15 User RW and RO Thread ID Registers */
 +
 +    op2 = (insn  5)  7;
 +    if (arm_feature (env, ARM_FEATURE_V6K)  ((insn  16)  0xf) == 13) {
 +        switch (op2) {
 +        case 2:
 +            return env-cp15.c13_tls1;
 +        case 3:
 +            return env-cp15.c13_tls2;
 +        default:
 +            goto bad_reg;
 +        }
 +    }
 +bad_reg:
     cpu_abort(env, cp15 insn %08x\n, insn);
 -    return 0;
  }

  /* These should probably raise undefined insn exceptions.  */

Most of the checks you do here could be done in translate.c.
Wouldn't it be better to do them there?


Laurent




<    1   2   3   4   5   >