Re: [Qemu-devel] [PATCH v2 02/12] ARM: Prepare translation for AArch64 code
On Wednesday, May 1, 2013, Richard Henderson r...@twiddle.net wrote: On 2013-04-30 07:36, John Rigby wrote: uint32_t regs[16]; + +/* Regs for A64 mode. */ +uint64_t xregs[31]; +uint64_t pc; +uint64_t sp; +uint32_t pstate; +uint32_t aarch64_state; /* 1 if CPU is in aarch64 state */ + How do these registers overlap (or not) in real hardware? Is it possible to union these with the 32-bit state? There is an overlap between 32- and 64-bit state registers, but it's against the set of 32 (?) 32-bit registers that exist across all modes, not against the 16 registers as used here. IMHO it doesn't make sense to union them, the mapping can be done when switching from 32- to 64-bit modes. Laurent
Re: [Qemu-devel] [PATCH v2 03/12] ARM: Add AArch64 translation stub
On Tue, Apr 30, 2013 at 8:36 AM, John Rigby john.ri...@linaro.org wrote: From: Alexander Graf ag...@suse.de We should translate AArch64 mode separately from AArch32 mode. In AArch64 mode, registers look vastly different, instruction encoding is completely different, basically the system turns into a different machine. So let's do a simple if() in translate.c to decide whether we can handle the current code in the legacy AArch32 code or in the new AArch64 code. So far, the translation always complains about unallocated instructions. There is no emulator functionality in this patch! Signed-off-by: Alexander Graf ag...@suse.de --- Changes in v2: - Remove uses of is_a64 that are not needed because arch choice happens at a higher level. - aarch64 register arrays now only have 31 entries with sp and xzr treated as special cases. +if (is_a64(env)) { target-arm/Makefile.objs |1 + target-arm/translate-a64.c | 139 target-arm/translate.c |9 +++ target-arm/translate.h |6 ++ 4 files changed, 155 insertions(+) create mode 100644 target-arm/translate-a64.c diff --git a/target-arm/Makefile.objs b/target-arm/Makefile.objs index d89b57c..e488edb 100644 --- a/target-arm/Makefile.objs +++ b/target-arm/Makefile.objs @@ -3,3 +3,4 @@ obj-$(CONFIG_SOFTMMU) += machine.o obj-$(CONFIG_KVM) += kvm.o obj-y += translate.o op_helper.o helper.o cpu.o obj-y += neon_helper.o iwmmxt_helper.o +obj-$(TARGET_AARCH64) += translate-a64.o diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c new file mode 100644 index 000..961792d --- /dev/null +++ b/target-arm/translate-a64.c @@ -0,0 +1,139 @@ +/* + * AArch64 translation + * + * Copyright (c) 2013 Alexander Graf ag...@suse.de + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see http://www.gnu.org/licenses/. + */ +#include stdarg.h +#include stdlib.h +#include stdio.h +#include string.h +#include inttypes.h + +#include cpu.h +#include tcg-op.h +#include qemu/log.h +#include translate.h +#include qemu/host-utils.h + +#include helper.h +#define GEN_HELPER 1 +#include helper.h + +static TCGv_i64 cpu_X[31]; +static TCGv_i64 cpu_pc; +static TCGv_i64 cpu_sp; +static TCGv_i32 pstate; + +static const char *regnames[] = +{ x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, lr }; + +/* initialize TCG globals. */ +void a64_translate_init(void) +{ +int i; + +cpu_pc = tcg_global_mem_new_i64(TCG_AREG0, +offsetof(CPUARMState, pc), +pc); +cpu_sp = tcg_global_mem_new_i64(TCG_AREG0, +offsetof(CPUARMState, sp), +sp); +for (i = 0; i 31; i++) { +cpu_X[i] = tcg_global_mem_new_i64(TCG_AREG0, + offsetof(CPUARMState, xregs[i]), + regnames[i]); +} + +pstate = tcg_global_mem_new_i32(TCG_AREG0, +offsetof(CPUARMState, pstate), +pstate); +} + +void cpu_dump_state_a64(CPUARMState *env, FILE *f, fprintf_function cpu_fprintf, +int flags) +{ +int i; + +cpu_fprintf(f, PC=%016PRIx64 SP=%016PRIx64\n, env-pc, env-sp); +for(i = 0; i 31; i++) { +cpu_fprintf(f, X%02d=%016PRIx64, i, env-xregs[i]); +if ((i % 4) == 3) +cpu_fprintf(f, \n); +else +cpu_fprintf(f, ); +} +cpu_fprintf(f, XZR=%016PRIx64\n, env-xregs[31]); I guess you want to print XSP so the format string is wrong and the register is wrong too, it should be env-sp. OTOH I would have preferred xregs[31] to be sp :-) Laurent +cpu_fprintf(f, PSTATE=%c%c%c%c\n, +env-pstate PSTATE_N ? 'n' : '.', +env-pstate PSTATE_Z ? 'z' : '.', +env-pstate PSTATE_C ? 'c' : '.', +env-pstate PSTATE_V ? 'v' : '.'); +cpu_fprintf(f, \n); +} + +void gen_a64_set_pc_im(uint64_t val) +{ +tcg_gen_movi_i64(cpu_pc, val); +} + +static void gen_exception(int excp) +{ +TCGv_i32 tmp =
Re: [Qemu-devel] Latest QEMU compilation fails
On Tue, Apr 9, 2013 at 11:07 AM, Michal Novotny minov...@redhat.com wrote: Hi, I've just pulled the latest git repository of QEMU but it's failing the compilation on missing virtio-9p-device.h header file: CCx86_64-softmmu/target-i386/arch_memory_mapping.o CCx86_64-softmmu/target-i386/arch_dump.o CCx86_64-softmmu/target-i386/kvm.o CCx86_64-softmmu/target-i386/hyperv.o CCx86_64-softmmu/hw/9pfs/virtio-9p-device.o CCx86_64-softmmu/hw/block/virtio-blk.o CCx86_64-softmmu/hw/char/virtio-serial-bus.o CCx86_64-softmmu/hw/9pfs/virtio-9p-device.o /home/mig/Work/kvm-downstream/git/qemu/hw/9pfs/virtio-9p-device.c:19:30: fatal error: virtio-9p-device.h: No such file or directory compilation terminated. make[1]: *** [hw/9pfs/virtio-9p-device.o] Error 1 make: *** [subdir-x86_64-softmmu] Error 2 Does it do for others as well or is it just a local issue? Same here (47b5264eb3e1cd2825e48d28fd0d1b239ed53974). The missing file used to be under hw/9pfs/. Laurent Thanks, Michal -- Michal Novotny minov...@redhat.com, RHCE, Red Hat Virtualization | libvirt-php bindings | php-virt-control.org
Re: [Qemu-devel] Finding first TranslationBlock in user mode emulation
On Fri, Apr 5, 2013 at 4:15 PM, Peter Maydell peter.mayd...@linaro.org wrote: On 5 April 2013 15:06, Ivan Jovanovic jovanovic.d.i...@gmail.com wrote: When I examine in_asm and out_asm logs I notice that before my program starts emulating, dynamic linker code is being emulated, which is, of course, expected. My question is, is there a way in qemu (a flag or something similar) to determine during emulation when the dl code finishes executing and execution of the first TranslationBlock of my code starts? No, because QEMU doesn't care what your program does -- it just emulates code and doesn't need to know whether it is in the dynamic linker or the main program binary or a shared library or even code your program has dynamically generated at runtime. I suggest you look at the symbols for your guest binary (with 'nm' or 'objdump', for example) and match up the addresses in them with your code. (In particular this should let you find out which TB corresponds to the main() function.) You can either do this by hand or you could probably write a script to annotate the logs for you. There's an easier way that will work even with stripped execs: use the ELF entry point. readelf -l prog | grep -i entry HTH, Laurent
Re: [Qemu-devel] [PATCH v3 05/20] tcg-arm: Handle constant arguments to add2/sub2
On Thu, Mar 28, 2013 at 5:04 PM, Richard Henderson r...@twiddle.net wrote: On 03/28/2013 08:56 AM, Peter Maydell wrote: +#define TO_CPSR (1 20) This is the S bit; I think it would be helpful if our #define had a name that made that clearer... Suggestions? I thought TO_CPSR was clear... SET_FLAGS? +ARITH_TST = 0x8 21 | TO_CPSR, +ARITH_CMP = 0xa 21 | TO_CPSR, +ARITH_CMN = 0xb 21 | TO_CPSR, +ARITH_ORR = 0xc 21, +ARITH_MOV = 0xd 21, +ARITH_BIC = 0xe 21, +ARITH_MVN = 0xf 21, }; It feels a little ugly to OR in the S bit in this enum, but I guess it works. I think it actually makes everything a lot cleaner myself. Especially given the previous definition that required runtime checks for TST et al. I also find this cleaner. OTOH I understand that Peter finds it odd given that other operations (such as ADD) can optionally set flags. Laurent
Re: [Qemu-devel] qemu-x86_64 on i386 host: SIGSEGV
On Mon, Mar 25, 2013 at 4:03 PM, Richard Henderson r...@twiddle.net wrote: On 03/24/2013 03:59 AM, Peter Maydell wrote: PC is FF600400 so either we've messed it up already or this is just 64 bit address space doesn't fit in a 32 bit one. This is probably the fallback vdso address. Yes, it looks like __NR_vtime. Laurent I've previously sent patches to the list (several times) to add a real vdso to qemu for x86_64, so that the glibc will do the right thing, but the patches never got reviewed or applied. I could revive them if someone commits to reviewing them this time. r~
Re: [Qemu-devel] [PATCH 11/20] qemu-char: use a glib timeout instead of qemu-timer
Hello, On Tue, Mar 5, 2013 at 6:51 PM, Amit Shah amit.s...@redhat.com wrote: From: Anthony Liguori aligu...@us.ibm.com Signed-off-by: Anthony Liguori aligu...@us.ibm.com Signed-off-by: Amit Shah amit.s...@redhat.com --- qemu-char.c | 68 - 1 file changed, 45 insertions(+), 23 deletions(-) diff --git a/qemu-char.c b/qemu-char.c index eb0ac81..6dba943 100644 --- a/qemu-char.c +++ b/qemu-char.c @@ -990,12 +990,50 @@ typedef struct { int connected; int polling; int read_bytes; -QEMUTimer *timer; +guint timer_tag; } PtyCharDriver; static void pty_chr_update_read_handler(CharDriverState *chr); static void pty_chr_state(CharDriverState *chr, int connected); +static gboolean pty_chr_timer(gpointer opaque) +{ +struct CharDriverState *chr = opaque; +PtyCharDriver *s = chr-opaque; + +if (s-connected) { +goto out; +} +if (s-polling) { +/* If we arrive here without polling being cleared due + * read returning -EIO, then we are (re-)connected */ +pty_chr_state(chr, 1); +goto out; +} + +/* Next poll ... */ +pty_chr_update_read_handler(chr); + +out: +return FALSE; +} + +static void pty_chr_rearm_timer(CharDriverState *chr, int ms) +{ +PtyCharDriver *s = chr-opaque; + +if (s-timer_tag) { +g_source_remove(s-timer_tag); +s-timer_tag = 0; +} + +if (ms == 1000) { +s-timer_tag = g_timeout_add_seconds(1, pty_chr_timer, chr); It looks like g_timeout_add_seconds isn't available for poor people using some old distros (glib 2.12.3 here). Thanks, Laurent +} else { +s-timer_tag = g_timeout_add(ms, pty_chr_timer, chr); +} +} + static int pty_chr_write(CharDriverState *chr, const uint8_t *buf, int len) { PtyCharDriver *s = chr-opaque; @@ -1065,7 +1103,7 @@ static void pty_chr_update_read_handler(CharDriverState *chr) * timeout to the normal (much longer) poll interval before the * timer triggers. */ -qemu_mod_timer(s-timer, qemu_get_clock_ms(rt_clock) + 10); +pty_chr_rearm_timer(chr, 10); } static void pty_chr_state(CharDriverState *chr, int connected) @@ -1080,7 +1118,7 @@ static void pty_chr_state(CharDriverState *chr, int connected) /* (re-)connect poll interval for idle guests: once per second. * We check more frequently in case the guests sends data to * the virtual device linked to our pty. */ -qemu_mod_timer(s-timer, qemu_get_clock_ms(rt_clock) + 1000); +pty_chr_rearm_timer(chr, 1000); } else { if (!s-connected) qemu_chr_generic_open(chr); @@ -1088,23 +1126,6 @@ static void pty_chr_state(CharDriverState *chr, int connected) } } -static void pty_chr_timer(void *opaque) -{ -struct CharDriverState *chr = opaque; -PtyCharDriver *s = chr-opaque; - -if (s-connected) -return; -if (s-polling) { -/* If we arrive here without polling being cleared due - * read returning -EIO, then we are (re-)connected */ -pty_chr_state(chr, 1); -return; -} - -/* Next poll ... */ -pty_chr_update_read_handler(chr); -} static void pty_chr_close(struct CharDriverState *chr) { @@ -1117,8 +1138,9 @@ static void pty_chr_close(struct CharDriverState *chr) fd = g_io_channel_unix_get_fd(s-fd); g_io_channel_unref(s-fd); close(fd); -qemu_del_timer(s-timer); -qemu_free_timer(s-timer); +if (s-timer_tag) { +g_source_remove(s-timer_tag); +} g_free(s); qemu_chr_be_event(chr, CHR_EVENT_CLOSED); } @@ -1170,7 +1192,7 @@ static CharDriverState *qemu_chr_open_pty(QemuOpts *opts) chr-chr_add_watch = pty_chr_add_watch; s-fd = io_channel_from_fd(master_fd); -s-timer = qemu_new_timer_ms(rt_clock, pty_chr_timer, chr); +s-timer_tag = 0; return chr; } -- 1.8.1.2
Re: [Qemu-devel] [PATCH 11/20] qemu-char: use a glib timeout instead of qemu-timer
On Fri, Mar 15, 2013 at 4:44 PM, Anthony Liguori aligu...@us.ibm.com wrote: Laurent Desnogues laurent.desnog...@gmail.com writes: Hello, On Tue, Mar 5, 2013 at 6:51 PM, Amit Shah amit.s...@redhat.com wrote: From: Anthony Liguori aligu...@us.ibm.com Signed-off-by: Anthony Liguori aligu...@us.ibm.com Signed-off-by: Amit Shah amit.s...@redhat.com --- qemu-char.c | 68 - 1 file changed, 45 insertions(+), 23 deletions(-) diff --git a/qemu-char.c b/qemu-char.c index eb0ac81..6dba943 100644 --- a/qemu-char.c +++ b/qemu-char.c @@ -990,12 +990,50 @@ typedef struct { int connected; int polling; int read_bytes; -QEMUTimer *timer; +guint timer_tag; } PtyCharDriver; static void pty_chr_update_read_handler(CharDriverState *chr); static void pty_chr_state(CharDriverState *chr, int connected); +static gboolean pty_chr_timer(gpointer opaque) +{ +struct CharDriverState *chr = opaque; +PtyCharDriver *s = chr-opaque; + +if (s-connected) { +goto out; +} +if (s-polling) { +/* If we arrive here without polling being cleared due + * read returning -EIO, then we are (re-)connected */ +pty_chr_state(chr, 1); +goto out; +} + +/* Next poll ... */ +pty_chr_update_read_handler(chr); + +out: +return FALSE; +} + +static void pty_chr_rearm_timer(CharDriverState *chr, int ms) +{ +PtyCharDriver *s = chr-opaque; + +if (s-timer_tag) { +g_source_remove(s-timer_tag); +s-timer_tag = 0; +} + +if (ms == 1000) { +s-timer_tag = g_timeout_add_seconds(1, pty_chr_timer, chr); It looks like g_timeout_add_seconds isn't available for poor people using some old distros (glib 2.12.3 here). Can you test adding: #if !GLIB_CHECK_VERSION(2, 14, 0) static guint g_timeout_add_seconds(guint interval, GSourceFunc function, gpointer data) { return g_timeout_add(interval * 1000, function, data); } #endif That works fine, thanks for looking! We probably should introduce a glib-compat to centralize work arounds for older versions of glib... Agreed. Thanks again, Laurent Regards, Anthony Liguori Thanks, Laurent +} else { +s-timer_tag = g_timeout_add(ms, pty_chr_timer, chr); +} +} + static int pty_chr_write(CharDriverState *chr, const uint8_t *buf, int len) { PtyCharDriver *s = chr-opaque; @@ -1065,7 +1103,7 @@ static void pty_chr_update_read_handler(CharDriverState *chr) * timeout to the normal (much longer) poll interval before the * timer triggers. */ -qemu_mod_timer(s-timer, qemu_get_clock_ms(rt_clock) + 10); +pty_chr_rearm_timer(chr, 10); } static void pty_chr_state(CharDriverState *chr, int connected) @@ -1080,7 +1118,7 @@ static void pty_chr_state(CharDriverState *chr, int connected) /* (re-)connect poll interval for idle guests: once per second. * We check more frequently in case the guests sends data to * the virtual device linked to our pty. */ -qemu_mod_timer(s-timer, qemu_get_clock_ms(rt_clock) + 1000); +pty_chr_rearm_timer(chr, 1000); } else { if (!s-connected) qemu_chr_generic_open(chr); @@ -1088,23 +1126,6 @@ static void pty_chr_state(CharDriverState *chr, int connected) } } -static void pty_chr_timer(void *opaque) -{ -struct CharDriverState *chr = opaque; -PtyCharDriver *s = chr-opaque; - -if (s-connected) -return; -if (s-polling) { -/* If we arrive here without polling being cleared due - * read returning -EIO, then we are (re-)connected */ -pty_chr_state(chr, 1); -return; -} - -/* Next poll ... */ -pty_chr_update_read_handler(chr); -} static void pty_chr_close(struct CharDriverState *chr) { @@ -1117,8 +1138,9 @@ static void pty_chr_close(struct CharDriverState *chr) fd = g_io_channel_unix_get_fd(s-fd); g_io_channel_unref(s-fd); close(fd); -qemu_del_timer(s-timer); -qemu_free_timer(s-timer); +if (s-timer_tag) { +g_source_remove(s-timer_tag); +} g_free(s); qemu_chr_be_event(chr, CHR_EVENT_CLOSED); } @@ -1170,7 +1192,7 @@ static CharDriverState *qemu_chr_open_pty(QemuOpts *opts) chr-chr_add_watch = pty_chr_add_watch; s-fd = io_channel_from_fd(master_fd); -s-timer = qemu_new_timer_ms(rt_clock, pty_chr_timer, chr); +s-timer_tag = 0; return chr; } -- 1.8.1.2
Re: [Qemu-devel] About the Thread model in Qemu【 Is it one thread per vcpu?】
On Wed, Mar 13, 2013 at 10:51 AM, Peter Maydell peter.mayd...@linaro.org wrote: [...] For now, I need a emulator to emulate more cores efficiently.I want to analyze whether Qemu can be faster when it emulate more cores, while parallelism maybe a good choice. There has been some other research work with QEMU in this area, you should probably look it up. However it has all been we fixed enough of the big problems to be able to run some benchmarks and write our research paper, so none of this is actually in the QEMU sources. (The short answer I think is probably but it will depend a lot on workload, and there's a lot of software engineering required to do it reliably.) I know of two such variants: COREMU and HQEMU, but as far as I know the latter isn't freely available. Laurent
Re: [Qemu-devel] [PATCH v2 0/9] tcg-arm improvements
On Tue, Mar 12, 2013 at 7:43 AM, Richard Henderson r...@twiddle.net wrote: Changes v1-v2: * Use more helper functions to handle K and N constraints. * Improve add2/sub2. * Improve epilogues, as suggested in the previous thread. * Fix a typo in the name of the deposit helper. * Implement division for cortex-a15. I applied all of the patch series and tested nbench in user mode on a quad core Cortex-A9 (which means I didn't test the div part and the add2/sub2 part since it's i386 code). It works and gave some nice speedup. Thanks, Laurent r~ Richard Henderson (9): tcg-arm: Use bic to implement and with constant tcg-arm: Handle negated constant arguments to and/sub tcg-arm: Allow constant first argument to sub tcg-arm: Use tcg_out_dat_rIN for compares tcg-arm: Handle constant arguments to add2/sub2 tcg-arm: Improve constant generation tcg-arm: Fold epilogue into INDEX_op_exit_tb tcg-arm: Implement deposit for armv7 tcg-arm: Implement division instructions disas/arm.c | 4 + tcg/arm/tcg-target.c | 410 --- tcg/arm/tcg-target.h | 14 +- 3 files changed, 304 insertions(+), 124 deletions(-) -- 1.8.1.2
Re: [Qemu-devel] [PATCH 01/19] linux-user: stack_base is now mandatory on all targets
On Wed, Feb 8, 2012 at 10:46 AM, Laurent Desnogues laurent.desnog...@gmail.com wrote: On Fri, Feb 3, 2012 at 3:49 PM, riku.voi...@linaro.org wrote: From: Riku Voipio riku.voi...@linaro.org Signed-off-by: Riku Voipio riku.voi...@linaro.org --- linux-user/qemu.h |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/linux-user/qemu.h b/linux-user/qemu.h index 55ad9d8..30e2abd 100644 --- a/linux-user/qemu.h +++ b/linux-user/qemu.h @@ -123,10 +123,10 @@ typedef struct TaskState { #endif #if defined(TARGET_ARM) || defined(TARGET_M68K) || defined(TARGET_UNICORE32) /* Extra fields for semihosted binaries. */ -uint32_t stack_base; uint32_t heap_base; uint32_t heap_limit; #endif +uint32_t stack_base; Shouldn't this be abi_ulong instead of uint32_t? Ping... Yes it's more than a year old, but if Aarch64 support with semihosting ever comes to life :) Laurent int used; /* non zero if used */ struct image_info *info; struct linux_binprm *bprm; Laurent
Re: [Qemu-devel] [PATCH 03/12] ARM: Prepare translation for AArch64 code
On Wed, Mar 6, 2013 at 10:36 AM, Alexander Graf ag...@suse.de wrote: Am 06.03.2013 um 08:11 schrieb Laurent Desnogues laurent.desnog...@gmail.com: On Wed, Mar 6, 2013 at 3:01 AM, Alexander Graf ag...@suse.de wrote: This patch adds all the prerequisites for AArch64 support that didn't fit into split up patches. It extends important bits in the core cpu headers to also take AArch64 mode into account. Signed-off-by: Alexander Graf ag...@suse.de --- include/elf.h |2 + target-arm/cpu.h | 103 target-arm/translate.c | 42 +++ 3 files changed, 103 insertions(+), 44 deletions(-) diff --git a/include/elf.h b/include/elf.h index a21ea53..0ff0ea6 100644 --- a/include/elf.h +++ b/include/elf.h @@ -109,6 +109,8 @@ typedef int64_t Elf64_Sxword; #define EM_OPENRISC 92/* OpenCores OpenRISC */ #define EM_UNICORE32110 /* UniCore32 */ +#define EM_AARCH64 183 /* ARM 64-bit architecture */ + /* * This is an interim value that we will use until the committee comes diff --git a/target-arm/cpu.h b/target-arm/cpu.h index c28a0d9..ec292c9 100644 --- a/target-arm/cpu.h +++ b/target-arm/cpu.h @@ -19,13 +19,19 @@ #ifndef CPU_ARM_H #define CPU_ARM_H -#define TARGET_LONG_BITS 32 +#include config.h -#define ELF_MACHINEEM_ARM +#if defined (TARGET_ARM64) + /* AArch64 definitions */ +# define TARGET_LONG_BITS 64 +# define ELF_MACHINE EM_AARCH64 +#else +# define TARGET_LONG_BITS 32 +# define ELF_MACHINE EM_ARM +#endif #define CPUArchState struct CPUARMState -#include config.h #include qemu-common.h #include exec/cpu-defs.h @@ -79,6 +85,13 @@ struct arm_boot_info; typedef struct CPUARMState { /* Regs for current mode. */ uint32_t regs[16]; + +/* Regs for A64 mode. */ +uint64_t xregs[32]; I'm not sure it makes sense to allocate space for xregs[31]. If it is the zero register then you would anyway have special cases in translation code to discard writes to reg 31 and so it could be argued you could also special case reads from reg 31 (and you should so that the optimizer knows it's zero). Perhaps you could use xregs[31] as SP, which after all is a more regular register than xzr. Sp is a separate env field in my patch set. So yeah, I should just drop xregs[31] and always special case it I suppose. That's the best way to avoid accidents I hope :) I think that using xregs[31] as sp would ease code generation. No need to special case both xzr and sp. OTOH if you only declare 31 regs and you access xregs[31] by accident you'd read/write pc which might help uncover bugs faster, though I wouldn't want to use that feature to ease development :-) Laurent Alex Laurent +uint64_t pc; +uint64_t sp; +uint32_t pstate; + /* Frequently accessed CPSR bits are stored separately for efficiency. This contains all the other bits. Use cpsr_{read,write} to access the whole CPSR. */ @@ -154,6 +167,11 @@ typedef struct CPUARMState { uint32_t c15_power_control; /* power control */ } cp15; +/* System registers (AArch64) */ +struct { +uint64_t tpidr_el0; +} sr; + struct { uint32_t other_sp; uint32_t vecbase; @@ -170,7 +188,7 @@ typedef struct CPUARMState { /* VFP coprocessor state. */ struct { -float64 regs[32]; +float64 regs[64]; uint32_t xregs[16]; /* We store these fpcsr fields separately for convenience. */ @@ -241,6 +259,24 @@ int bank_number(int mode); void switch_mode(CPUARMState *, int); uint32_t do_arm_semihosting(CPUARMState *env); +static inline bool is_a64(CPUARMState *env) +{ +#ifdef TARGET_ARM64 +return true; +#else +return false; +#endif +} + +#define PSTATE_N_SHIFT 3 +#define PSTATE_N (1 PSTATE_N_SHIFT) +#define PSTATE_Z_SHIFT 2 +#define PSTATE_Z (1 PSTATE_Z_SHIFT) +#define PSTATE_C_SHIFT 1 +#define PSTATE_C (1 PSTATE_C_SHIFT) +#define PSTATE_V_SHIFT 0 +#define PSTATE_V (1 PSTATE_V_SHIFT) + /* you can call this signal handler from your SIGBUS and SIGSEGV signal handlers to inform the virtual CPU of exceptions. non zero is returned if the signal was handled by the virtual CPU. */ @@ -624,8 +660,13 @@ static inline bool cp_access_ok(CPUARMState *env, #define TARGET_PAGE_BITS 10 #endif -#define TARGET_PHYS_ADDR_SPACE_BITS 40 -#define TARGET_VIRT_ADDR_SPACE_BITS 32 +#if defined (TARGET_ARM64) +# define TARGET_PHYS_ADDR_SPACE_BITS 64 +# define TARGET_VIRT_ADDR_SPACE_BITS 64 +#else +# define TARGET_PHYS_ADDR_SPACE_BITS 40 +# define TARGET_VIRT_ADDR_SPACE_BITS 32 +#endif static inline CPUARMState *cpu_init(const char *cpu_model) { @@ -699,25 +740,31 @@ static inline void cpu_clone_regs(CPUARMState *env, target_ulong newsp) static inline void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc
Re: [Qemu-devel] [PATCH 04/12] ARM: Add AArch64 translation stub
(This mail was previously sent by accident to Alexander only.) On Wed, Mar 6, 2013 at 3:01 AM, Alexander Graf ag...@suse.de wrote: We should translate AArch64 mode separately from AArch32 mode. In AArch64 mode, registers look vastly different, instruction encoding is completely different, basically the system turns into a different machine. So let's do a simple if() in translate.c to decide whether we can handle the current code in the legacy AArch32 code or in the new AArch64 code. So far, the translation always complains about unallocated instructions. There is no emulator functionality in this patch! Signed-off-by: Alexander Graf ag...@suse.de --- target-arm/Makefile.objs |1 + target-arm/translate-a64.c | 139 target-arm/translate.c | 15 + target-arm/translate.h |6 ++ 4 files changed, 161 insertions(+), 0 deletions(-) create mode 100644 target-arm/translate-a64.c diff --git a/target-arm/Makefile.objs b/target-arm/Makefile.objs index d89b57c..f9f0de4 100644 --- a/target-arm/Makefile.objs +++ b/target-arm/Makefile.objs @@ -3,3 +3,4 @@ obj-$(CONFIG_SOFTMMU) += machine.o obj-$(CONFIG_KVM) += kvm.o obj-y += translate.o op_helper.o helper.o cpu.o obj-y += neon_helper.o iwmmxt_helper.o +obj-$(TARGET_ARM64) += translate-a64.o diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c new file mode 100644 index 000..f354d08 --- /dev/null +++ b/target-arm/translate-a64.c @@ -0,0 +1,139 @@ +/* + * AArch64 translation + * + * Copyright (c) 2013 Alexander Graf ag...@suse.de + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see http://www.gnu.org/licenses/. + */ +#include stdarg.h +#include stdlib.h +#include stdio.h +#include string.h +#include inttypes.h + +#include cpu.h +#include tcg-op.h +#include qemu/log.h +#include translate.h +#include qemu/host-utils.h + +#include helper.h +#define GEN_HELPER 1 +#include helper.h + +static TCGv_i64 cpu_X[32]; +static TCGv_i64 cpu_pc; +static TCGv_i64 cpu_sp; +static TCGv_i32 pstate; + +static const char *regnames[] = +{ x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, lr, xzr }; Wouldn't it be better to use sp instead of xzr? I would expect the translator not to emit TCG ops with reg 31 meaning zero :-) Laurent + +/* initialize TCG globals. */ +void a64_translate_init(void) +{ +int i; + +cpu_pc = tcg_global_mem_new_i64(TCG_AREG0, +offsetof(CPUARMState, pc), +pc); +cpu_sp = tcg_global_mem_new_i64(TCG_AREG0, +offsetof(CPUARMState, sp), +sp); +for (i = 0; i 32; i++) { +cpu_X[i] = tcg_global_mem_new_i64(TCG_AREG0, + offsetof(CPUARMState, xregs[i]), + regnames[i]); +} + +pstate = tcg_global_mem_new_i32(TCG_AREG0, +offsetof(CPUARMState, pstate), +pstate); +} + +void cpu_dump_state_a64(CPUARMState *env, FILE *f, fprintf_function cpu_fprintf, +int flags) +{ +int i; + +cpu_fprintf(f, PC=%016PRIx64 SP=%016PRIx64\n, env-pc, env-sp); +for(i = 0; i 31; i++) { +cpu_fprintf(f, X%02d=%016PRIx64, i, env-xregs[i]); +if ((i % 4) == 3) +cpu_fprintf(f, \n); +else +cpu_fprintf(f, ); +} +cpu_fprintf(f, XZR=%016PRIx64\n, env-xregs[31]); +cpu_fprintf(f, PSTATE=%c%c%c%c\n, +env-pstate PSTATE_N ? 'n' : '.', +env-pstate PSTATE_Z ? 'z' : '.', +env-pstate PSTATE_C ? 'c' : '.', +env-pstate PSTATE_V ? 'v' : '.'); +cpu_fprintf(f, \n); +} + +void gen_a64_set_pc_im(uint64_t val) +{ +tcg_gen_movi_i64(cpu_pc, val); +} + +static void gen_exception(int excp) +{ +TCGv_i32 tmp = tcg_temp_new_i32(); +tcg_gen_movi_i32(tmp, excp); +gen_helper_exception(cpu_env, tmp); +tcg_temp_free_i32(tmp); +} + +static void gen_exception_insn(DisasContext *s, int offset, int excp) +{ +
Re: [Qemu-devel] [PATCH 03/12] ARM: Prepare translation for AArch64 code
On Wed, Mar 6, 2013 at 3:01 AM, Alexander Graf ag...@suse.de wrote: This patch adds all the prerequisites for AArch64 support that didn't fit into split up patches. It extends important bits in the core cpu headers to also take AArch64 mode into account. Signed-off-by: Alexander Graf ag...@suse.de --- include/elf.h |2 + target-arm/cpu.h | 103 target-arm/translate.c | 42 +++ 3 files changed, 103 insertions(+), 44 deletions(-) diff --git a/include/elf.h b/include/elf.h index a21ea53..0ff0ea6 100644 --- a/include/elf.h +++ b/include/elf.h @@ -109,6 +109,8 @@ typedef int64_t Elf64_Sxword; #define EM_OPENRISC 92/* OpenCores OpenRISC */ #define EM_UNICORE32110 /* UniCore32 */ +#define EM_AARCH64 183 /* ARM 64-bit architecture */ + /* * This is an interim value that we will use until the committee comes diff --git a/target-arm/cpu.h b/target-arm/cpu.h index c28a0d9..ec292c9 100644 --- a/target-arm/cpu.h +++ b/target-arm/cpu.h @@ -19,13 +19,19 @@ #ifndef CPU_ARM_H #define CPU_ARM_H -#define TARGET_LONG_BITS 32 +#include config.h -#define ELF_MACHINEEM_ARM +#if defined (TARGET_ARM64) + /* AArch64 definitions */ +# define TARGET_LONG_BITS 64 +# define ELF_MACHINE EM_AARCH64 +#else +# define TARGET_LONG_BITS 32 +# define ELF_MACHINE EM_ARM +#endif #define CPUArchState struct CPUARMState -#include config.h #include qemu-common.h #include exec/cpu-defs.h @@ -79,6 +85,13 @@ struct arm_boot_info; typedef struct CPUARMState { /* Regs for current mode. */ uint32_t regs[16]; + +/* Regs for A64 mode. */ +uint64_t xregs[32]; I'm not sure it makes sense to allocate space for xregs[31]. If it is the zero register then you would anyway have special cases in translation code to discard writes to reg 31 and so it could be argued you could also special case reads from reg 31 (and you should so that the optimizer knows it's zero). Perhaps you could use xregs[31] as SP, which after all is a more regular register than xzr. Laurent +uint64_t pc; +uint64_t sp; +uint32_t pstate; + /* Frequently accessed CPSR bits are stored separately for efficiency. This contains all the other bits. Use cpsr_{read,write} to access the whole CPSR. */ @@ -154,6 +167,11 @@ typedef struct CPUARMState { uint32_t c15_power_control; /* power control */ } cp15; +/* System registers (AArch64) */ +struct { +uint64_t tpidr_el0; +} sr; + struct { uint32_t other_sp; uint32_t vecbase; @@ -170,7 +188,7 @@ typedef struct CPUARMState { /* VFP coprocessor state. */ struct { -float64 regs[32]; +float64 regs[64]; uint32_t xregs[16]; /* We store these fpcsr fields separately for convenience. */ @@ -241,6 +259,24 @@ int bank_number(int mode); void switch_mode(CPUARMState *, int); uint32_t do_arm_semihosting(CPUARMState *env); +static inline bool is_a64(CPUARMState *env) +{ +#ifdef TARGET_ARM64 +return true; +#else +return false; +#endif +} + +#define PSTATE_N_SHIFT 3 +#define PSTATE_N (1 PSTATE_N_SHIFT) +#define PSTATE_Z_SHIFT 2 +#define PSTATE_Z (1 PSTATE_Z_SHIFT) +#define PSTATE_C_SHIFT 1 +#define PSTATE_C (1 PSTATE_C_SHIFT) +#define PSTATE_V_SHIFT 0 +#define PSTATE_V (1 PSTATE_V_SHIFT) + /* you can call this signal handler from your SIGBUS and SIGSEGV signal handlers to inform the virtual CPU of exceptions. non zero is returned if the signal was handled by the virtual CPU. */ @@ -624,8 +660,13 @@ static inline bool cp_access_ok(CPUARMState *env, #define TARGET_PAGE_BITS 10 #endif -#define TARGET_PHYS_ADDR_SPACE_BITS 40 -#define TARGET_VIRT_ADDR_SPACE_BITS 32 +#if defined (TARGET_ARM64) +# define TARGET_PHYS_ADDR_SPACE_BITS 64 +# define TARGET_VIRT_ADDR_SPACE_BITS 64 +#else +# define TARGET_PHYS_ADDR_SPACE_BITS 40 +# define TARGET_VIRT_ADDR_SPACE_BITS 32 +#endif static inline CPUARMState *cpu_init(const char *cpu_model) { @@ -699,25 +740,31 @@ static inline void cpu_clone_regs(CPUARMState *env, target_ulong newsp) static inline void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc, target_ulong *cs_base, int *flags) { -int privmode; -*pc = env-regs[15]; -*cs_base = 0; -*flags = (env-thumb ARM_TBFLAG_THUMB_SHIFT) -| (env-vfp.vec_len ARM_TBFLAG_VECLEN_SHIFT) -| (env-vfp.vec_stride ARM_TBFLAG_VECSTRIDE_SHIFT) -| (env-condexec_bits ARM_TBFLAG_CONDEXEC_SHIFT) -| (env-bswap_code ARM_TBFLAG_BSWAP_CODE_SHIFT); -if (arm_feature(env, ARM_FEATURE_M)) { -privmode = !((env-v7m.exception == 0) (env-v7m.control 1)); +if (is_a64(env)) { +*pc = env-pc; +*flags = 0; }
Re: [Qemu-devel] [PATCH] glib: Add compat wrapper for g_poll on old glib
On Tue, Feb 26, 2013 at 12:46 AM, Alexander Graf ag...@suse.de wrote: Older glib doesn't implement g_poll(). Most notably the glib version in use on SLE11 is on 2.18 which is hit by this. We do want to use g_poll() in the source however. So on older systems, just wrap it with functions that do exist on older versions. That compiles fine on my old CentOS workstation. Thanks, Laurent Signed-off-by: Anthony Liguori aligu...@us.ibm.com Signed-off-by: Alexander Graf ag...@suse.de --- include/qemu-common.h | 12 1 files changed, 12 insertions(+), 0 deletions(-) diff --git a/include/qemu-common.h b/include/qemu-common.h index 80016ad..5e13708 100644 --- a/include/qemu-common.h +++ b/include/qemu-common.h @@ -142,6 +142,18 @@ int qemu_main(int argc, char **argv, char **envp); void qemu_get_timedate(struct tm *tm, int offset); int qemu_timedate_diff(struct tm *tm); +#if !GLIB_CHECK_VERSION(2, 20, 0) +/* + * Glib before 2.20.0 doesn't implement g_poll, so wrap it to compile properly + * on older systems. + */ +static inline gint g_poll(GPollFD *fds, guint nfds, gint timeout) +{ +GMainContext *ctx = g_main_context_default(); +return g_main_context_get_poll_func(ctx)(fds, nfds, timeout); +} +#endif + /** * is_help_option: * @s: string to test -- 1.6.0.2
Re: [Qemu-devel] [PATCH] arm/translate.c: Fix adc_CC/sbc_CC implementation
On Mon, Feb 25, 2013 at 3:43 PM, Richard Henderson r...@twiddle.net wrote: On 2013-02-25 00:04, Peter Crosthwaite wrote: commits 49b4c31efcce45ab714f286f14fa5d5173f9069d and 2de68a4900ef6eb67380b0c128abfe1976bc66e8 reworked the implementation of adc_CC and sub_CC. The new implementations (on the TCG_TARGET_HAS_add2_i32 code path) are incorrect. The new logic is: CF:NF = 0:A +/- 0:CF CF:NF = CF:A +/- 0:B The lower 32 bits of the intermediate result stored in NF needs to be passes into the second addition in place of A (s/CF:A/CF:NF): CF:NF = 0:A +/- 0:CF CF:NF = CF:NF +/- 0:B This patch fixes the issue. Signed-off-by: Peter Crosthwaitepeter.crosthwa...@xilinx.com --- target-arm/translate.c |4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) Reviewed-by: Richard Henderson r...@twiddle.net Sorry for the breakage. Blue, please apply asap. I'm afraid this fix is not enough as I still can't get my Linux image to boot after applying it. Running this, my image boots: git checkout 49b4c31efcce45ab714f286f14fa5d5173f9069d target-arm Looking at the new sbc_cc, it seems that if t0=t1 and CF=1, then CF will be cleared while the old code in the helper did set it. Laurent PS: My image is the vexpress found here: http://releases.linaro.org/images/linaro-n/alip/11.08
Re: [Qemu-devel] [PATCH v2 0/3] Fix two carry problems in add2 patchset
On Mon, Feb 25, 2013 at 8:01 PM, Richard Henderson r...@twiddle.net wrote: V1-V2: Dangit, that's what trying to be speedy gets you, failing to make all of the changes you wrote in the comment. This time actually fix the sub2 to add2 like I intended, not just the not portion. The patch set is re-pushed at git://github.com/rth7680/qemu.git fix-carry Please *review* asap. My kernel is happier now :-) Thanks, Laurent r~ Peter Crosthwaite (1): arm/translate.c: Fix adc_CC/sbc_CC implementation Richard Henderson (2): target-arm: Fix sbc_CC carry target-ppc: Fix SUBFE carry target-arm/translate.c | 10 +- target-ppc/translate.c | 9 + 2 files changed, 10 insertions(+), 9 deletions(-) -- 1.8.1.2
Re: [Qemu-devel] [PATCH v4 00/10] main-loop: switch to g_poll(3) on POSIX hosts
On Wed, Feb 20, 2013 at 11:28 AM, Stefan Hajnoczi stefa...@redhat.com wrote: Amos Kong ak...@redhat.com reported that file descriptors numbered higher than 1024 could crash QEMU. This is due to the fixed size of the fd_set type used for select(2) event polling. This series converts the main-loop.c and aio-posix.c select(2) calls to g_poll(3). This eliminates the fd_set type and allows QEMU to scale to high numbers of file descriptors. The g_poll(3) interface is a portable version of the poll(2) system call. The difference to select(2) is that fine-grained events (G_IO_IN, G_IO_OUT, G_IO_HUP, G_IO_ERR, G_IO_PRI) can be monitored instead of just read/write/exception. Also, there is no limit to the file descriptor numbers that may be used, allowing applications to scale to many file descriptors. See the documentation for details: http://developer.gnome.org/glib/2.28/glib-The-Main-Event-Loop.html#g-poll g_poll isn't available in older glib versions used by CentOS (2.12 on CentOS 5.6). Thanks, Laurent The QEMU main loop works as follows today: 1. Call out to slirp, iohandlers, and glib sources to fill rfds/wfds/xfds with the file descriptors to select(2). 2. Perform the select(2) call. 3. Call out to slirp, iohandlers, and glib sources to handle events polled in rfds/wfds/xfds. The plan of attack is as follows: 1. Replace select(2) with g_poll(3). Use glue that converts between rfds/wfds/xfds and GPollFD so that the unconverted QEMU components still work. 2. Convert slirp, iohandlers, and glib source fill/poll functions to use GPollFD directly instead of rfds/wfds/xfds. 3. Drop the glue since all components now natively use GPollFD. 4. Convert aio-posix.c to g_poll(3) by reusing GPollFD. I have tested that the series builds and is bisectable on Linux and Windows hosts. But I have not done extensive testing on other host platforms or with long-term guests to check for performance regressions. v4: * Assign revents instead of bitwise OR to make code clearer [Laszlo] * Invoke pollfds_poll() only when select(2) succeeds [Laszlo] * Fix gpollfds_to_select(select_ret || g_poll_ret) call [Laszlo] v3: * Convert slirp/slirp.c to tabs to spaces [Blue Swirl] v2: * Replace custom Poller type with GArray [aliguori] Stefan Hajnoczi (10): main-loop: fix select_ret uninitialized variable warning main-loop: switch to g_poll() on POSIX hosts main-loop: switch POSIX glib integration to GPollFD slirp: slirp/slirp.c coding style cleanup slirp: switch to GPollFD iohandler: switch to GPollFD main-loop: drop rfds/wfds/xfds for good aio: extract aio_dispatch() from aio_poll() aio: convert aio_poll() to g_poll(3) aio: support G_IO_HUP and G_IO_ERR aio-posix.c | 130 - async.c | 2 + include/block/aio.h | 3 + include/qemu/main-loop.h | 4 +- iohandler.c | 40 ++- main-loop.c | 158 ++- slirp/libslirp.h | 6 +- slirp/main.h | 1 - slirp/slirp.c| 673 +-- slirp/socket.c | 9 - slirp/socket.h | 2 + stubs/slirp.c| 6 +- 12 files changed, 547 insertions(+), 487 deletions(-) -- 1.8.1.2
Re: [Qemu-devel] GTK UI is now the default
On Fri, Feb 22, 2013 at 12:04 AM, Anthony Liguori aligu...@us.ibm.com wrote: Since this is a pretty visible change for a lot of people, I thought I'd send a top level note. The GTK UI is now committed and is the default UI provided it's available. For anyone counting, it's been a little more than 7 years in the making: http://article.gmane.org/gmane.comp.emulators.qemu/9726 If you want to try it, make sure you have the gtk-2.0 development packages installed and the VTE development packages. It looks like gtk_widget_get_window isn't available in all gtk 2 versions, at least it's not in the gtk2 2.10 on my CentOS 5.6 system. Thanks, Laurent
Re: [Qemu-devel] GTK UI is now the default
On Fri, Feb 22, 2013 at 11:36 AM, Daniel P. Berrange berra...@redhat.com wrote: On Fri, Feb 22, 2013 at 11:12:35AM +0100, Laurent Desnogues wrote: On Fri, Feb 22, 2013 at 12:04 AM, Anthony Liguori aligu...@us.ibm.com wrote: Since this is a pretty visible change for a lot of people, I thought I'd send a top level note. The GTK UI is now committed and is the default UI provided it's available. For anyone counting, it's been a little more than 7 years in the making: http://article.gmane.org/gmane.comp.emulators.qemu/9726 If you want to try it, make sure you have the gtk-2.0 development packages installed and the VTE development packages. It looks like gtk_widget_get_window isn't available in all gtk 2 versions, at least it's not in the gtk2 2.10 on my CentOS 5.6 system. The docs say since 2.14 http://developer.gnome.org/gtk3/stable/GtkWidget.html#gtk-widget-get-window IIRC before that you could simply refernece widget-window directly. So you could probably try adding something like this: #if ! GTK_CHECK_VERSION(2, 14, 0) #define gtk_widget_get_window(w) (w)-window #endif and see if that fixes your build. That fixes the first error, but the others remain: ui/gtk.c: In function 'gd_resize': ui/gtk.c:254: warning: implicit declaration of function 'cairo_format_stride_for_width' ui/gtk.c:254: warning: nested extern declaration of 'cairo_format_stride_for_width' ui/gtk.c: In function 'gd_draw_event': ui/gtk.c:355: warning: implicit declaration of function 'gtk_widget_get_realized' ui/gtk.c:355: warning: nested extern declaration of 'gtk_widget_get_realized' ui/gtk.c: In function 'gd_vc_init': ui/gtk.c:864: error: 'VtePty' undeclared (first use in this function) ui/gtk.c:864: error: (Each undeclared identifier is reported only once ui/gtk.c:864: error: for each function it appears in.) ui/gtk.c:864: error: 'pty' undeclared (first use in this function) ui/gtk.c:885: error: 'GDK_KEY_2' undeclared (first use in this function) ui/gtk.c:897: warning: implicit declaration of function 'vte_pty_new_foreign' ui/gtk.c:897: warning: nested extern declaration of 'vte_pty_new_foreign' ui/gtk.c:899: warning: implicit declaration of function 'vte_terminal_set_pty_object' ui/gtk.c:899: warning: nested extern declaration of 'vte_terminal_set_pty_object' ui/gtk.c: In function 'gd_create_menus': ui/gtk.c:1009: error: 'GDK_KEY_f' undeclared (first use in this function) ui/gtk.c:1018: error: 'GDK_KEY_plus' undeclared (first use in this function) ui/gtk.c:1024: error: 'GDK_KEY_minus' undeclared (first use in this function) ui/gtk.c:1030: error: 'GDK_KEY_0' undeclared (first use in this function) ui/gtk.c:1045: error: 'GDK_KEY_g' undeclared (first use in this function) ui/gtk.c:1055: error: 'GDK_KEY_1' undeclared (first use in this function) ui/gtk.c: In function 'gtk_display_init': ui/gtk.c:1109: error: 'GDK_BLANK_CURSOR' undeclared (first use in this function) ui/gtk.c:1131: warning: implicit declaration of function 'gtk_widget_set_can_focus' ui/gtk.c:1131: warning: nested extern declaration of 'gtk_widget_set_can_focus' Thanks, Laurent
Re: [Qemu-devel] [PATCH for-1.4] linux-user: Restore cast to target type in get_user()
On Thu, Jan 31, 2013 at 1:50 PM, Peter Maydell peter.mayd...@linaro.org wrote: Commit 658f2dc97 accidentally dropped the cast to the target type of the value loaded by get_user(). The most visible effect of this would be that the sequence uint64_t v; get_user_u32(v, addr) would sign extend the 32 bit loaded value into v rather than zero extending as would be expected for a _u32 accessor. Put the cast back again to restore the old behaviour. This fixes the issue I mentioned a week ago, thanks. Reviewed-by: Laurent Desnogues laurent.desnog...@gmail.com Laurent Signed-off-by: Peter Maydell peter.mayd...@linaro.org --- linux-user/qemu.h |4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/linux-user/qemu.h b/linux-user/qemu.h index 31a220a..b10e957 100644 --- a/linux-user/qemu.h +++ b/linux-user/qemu.h @@ -306,12 +306,12 @@ static inline int access_ok(int type, abi_ulong addr, abi_ulong size) ((hptr), (x)), 0) #define __get_user_e(x, hptr, e)\ - ((x) =\ + ((x) = (typeof(*hptr))( \ __builtin_choose_expr(sizeof(*(hptr)) == 1, ldub_p, \ __builtin_choose_expr(sizeof(*(hptr)) == 2, lduw_##e##_p,\ __builtin_choose_expr(sizeof(*(hptr)) == 4, ldl_##e##_p, \ __builtin_choose_expr(sizeof(*(hptr)) == 8, ldq_##e##_p, abort \ - (hptr), 0) + (hptr)), 0) #ifdef TARGET_WORDS_BIGENDIAN # define __put_user(x, hptr) __put_user_e(x, hptr, be) -- 1.7.9.5
Re: [Qemu-devel] [PATCH 00/57] target-i386 eflags cleanup and bmi/adx extensions
On Thu, Jan 24, 2013 at 9:37 PM, Richard Henderson r...@twiddle.net wrote: On 01/24/2013 08:57 AM, Laurent Desnogues wrote: On Thu, Jan 24, 2013 at 5:52 PM, Richard Henderson r...@twiddle.net wrote: On 2013-01-24 08:46, Laurent Desnogues wrote: I gave a quick try a your branch. My host is an x86_64 CPU and I ran an i386 nbench in user mode. It works but some parts of the benchmark are noticeably slower (10%). Is that expected? Nope. Everything in there should be about speeding up... I'll have a look at it and see if there's something obvious. Let me know if you need more information or the binary (I compiled it some time ago with the oldest compiler I could find, gcc 2.96). Would you look and see how much variability you're getting? I had a quick look with a (newly built) nbench binary and don't see any speed regressions outside the error bars. Built with gcc 4.7.2, 4 trials each: Master Eflags3 Avg Stddev Avg Stddev Change Error Num S 585.92 18.25 573.79 4.39 -2.07% 3.12% String S51.14 1.1051.52 0.13 0.73% 2.15% Bitfield1.64E+008 4.04E+006 1.62E+008 8.63E+005 -1.32% 2.46% FP Emu 85.65 1.81114.74 1.18 33.97% 2.12% Fourier 1365.03 28.79 2813.78 11.72 106.13% 2.11% Assign 14.86 0.2414.89 0.21 0.22% 1.62% Idea723.70 43.31 884.20 4.55 22.18% 5.98% Huff495.27 8.72702.89 3.53 41.92% 1.76% N Net 0.290.010.730.00 149.99% 1.78% LU Decomp 9.260.1621.91 0.22 136.61% 1.70% I haven't looked to see where the massive fp improvements come from, but my first guess is not storing cc_op so often. Although perhaps it would keep us on the same page if we were talking about the exact same binary... Here are my results (5 runs): master eflags3eflags3 stddev stddev/master NUMERIC SO 488.59 3.82 507.37 3.871.04 STRING SOR 42.10 0.27 43.51 0.131.03 BITFIELD 105344000.00 885426.45 91472800.00 426835.680.87 FP EMULATI 22.70 0.54 23.07 0.381.02 FOURIER2669.56 16.12 2576.34 29.460.97 ASSIGNMENT8.40 0.117.68 0.020.91 IDEA 1535.88 7.73 1620.72 3.801.06 HUFFMAN 212.21 2.08 214.34 1.281.01 NEURAL NET0.75 0.000.76 0.001.01 LU DECOMPO 24.25 0.06 24.75 0.081.02 Host is an i7-920 running gcc 4.6.1 x86_64. nbench was compiled with gcc 2.96 targetting 32-bit. The binary I have is old and has been stripped and I alas have no access to gcc 2.96 anymore. Note that using a more recent compiler doesn't show that much difference. Laurent
Re: [Qemu-devel] [PATCH 00/57] target-i386 eflags cleanup and bmi/adx extensions
On Thu, Jan 24, 2013 at 5:02 AM, Richard Henderson r...@twiddle.net wrote: This is a re-working of Paolo's eflags cleanup from October, which I consider a pre-requisite to implementing the ADX extension. I've rearranged most of the patches in trivial ways, and some quite significantly. I've tested the result by running the FC17 installer in both i386 and x86_64 mode, and the bmi/adx extensions with small user-land test cases. The patch series is at git://github.com/rth7680/qemu.git eflags3 I gave a quick try a your branch. My host is an x86_64 CPU and I ran an i386 nbench in user mode. It works but some parts of the benchmark are noticeably slower (10%). Is that expected? Thanks, Laurent Please review. r~ Paolo Bonzini (19): test-i386: QEMU_PACKED is not defined here test-i386: make it compile with a recent gcc target-i386: use OT_* consistently target-i386: introduce gen_ext_tl target-i386: factor setting of s-cc_op handling for string functions target-i386: drop cc_op argument of gen_jcc1 target-i386: move carry computation for inc/dec closer to gen_op_set_cc_op target-i386: move eflags computation closer to gen_op_set_cc_op target-i386: compute eflags outside rcl/rcr helper target-i386: clean up sahf target-i386: use gen_jcc1 to compile loopz target-i386: factor gen_op_set_cc_op/tcg_gen_discard_tl around computing flags target-i386: add helper functions to get other flags target-i386: change gen_setcc_slow_T0 to gen_setcc_slow target-i386: optimize setcc instructions target-i386: use CCPrepare to generate conditional jumps target-i386: cleanup temporary macros for CCPrepare target-i386: introduce gen_cmovcc1 target-i386: kill cpu_T3 Richard Henderson (38): target-i386: Name the cc_op enumeration target-i386: Introduce set_cc_op target-i386: Don't clobber s-cc_op in gen_update_cc_op target-i386: Use gen_update_cc_op everywhere target-i386: do not compute eflags multiple times consecutively target-i386: no need to flush out cc_op before gen_eob target-i386: Move CC discards to set_cc_op target-i386: do not call helper to compute ZF/SF target-i386: use inverted setcond when computing NS or NZ target-i386: convert gen_compute_eflags_c to TCG target-i386: optimize setbe target-i386: optimize setle target-i386: introduce CCPrepare target-i386: introduce gen_prepare_cc target-i386: inline gen_prepare_cc_slow target-i386: expand cmov via movcond target-i386: use gen_op for cmps/scas target-i386: introduce gen_jcc1_noeob target-i386: Update cc_op before TCG branches target-i386: optimize flags checking after sub using CC_SRC2 target-i386: Use CC_SRC2 for ADC and SBB target-i386: Don't reference ENV through most of cc helpers target-i386: Make helper_cc_compute_all const target-i386: Tidy prefix parsing target-i386: Decode the VEX prefixes target-i386: Implement MOVBE target-i386: Implement ANDN target-i386: Implement BEXTR target-i386: Implement BLSR, BLSMSK, BLSI target-i386: Implement BZHI target-i386: Implement MULX target-i386: Implement PDEP, PEXT target-i386: Implement SHLX, SARX, SHRX target-i386: Implement RORX target-i386: Implement ADX extension target-i386: Use clz/ctz for bsf/bsr helpers target-i386: Simplify bsf/bsr flags computation target-i386: Implement tzcnt and fix lzcnt target-i386/cc_helper.c | 243 ++-- target-i386/cc_helper_template.h| 268 ++--- target-i386/cpu.c | 18 +- target-i386/cpu.h | 24 +- target-i386/helper.c| 11 +- target-i386/helper.h| 12 +- target-i386/int_helper.c| 69 +- target-i386/shift_helper_template.h | 12 +- target-i386/translate.c | 2195 +-- tests/tcg/test-i386.c | 10 +- 10 files changed, 1581 insertions(+), 1281 deletions(-) -- 1.7.11.7
Re: [Qemu-devel] [PATCH 6/8] linux-user: Rewrite __get_user/__put_user with __builtin_choose_expr
Hi, I know this was committed some time ago, but I have a comment about this patch. On Sat, Jan 5, 2013 at 1:39 AM, Richard Henderson r...@twiddle.net wrote: The previous formuation with multiple assignments to __typeof(*hptr) falls down when hptr is qualified const. E.g. with const struct S *p, p-f is also qualified const. With this formulation, there's no assignment to any local variable. Signed-off-by: Richard Henderson r...@twiddle.net --- linux-user/qemu.h | 63 +-- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/linux-user/qemu.h b/linux-user/qemu.h index 8a3538c..31a220a 100644 --- a/linux-user/qemu.h +++ b/linux-user/qemu.h @@ -287,36 +287,39 @@ static inline int access_ok(int type, abi_ulong addr, abi_ulong size) (type == VERIFY_READ) ? PAGE_READ : (PAGE_READ | PAGE_WRITE)) == 0; } -/* NOTE __get_user and __put_user use host pointers and don't check access. */ -/* These are usually used to access struct data members once the - * struct has been locked - usually with lock_user_struct(). - */ -#define __put_user(x, hptr)\ -({ __typeof(*hptr) pu_ = (x);\ -switch(sizeof(*hptr)) {\ -case 1: break;\ -case 2: pu_ = tswap16(pu_); break; \ -case 4: pu_ = tswap32(pu_); break; \ -case 8: pu_ = tswap64(pu_); break; \ -default: abort();\ -}\ -memcpy(hptr, pu_, sizeof(pu_)); \ -0;\ -}) - -#define __get_user(x, hptr) \ -({ __typeof(*hptr) gu_; \ -memcpy(gu_, hptr, sizeof(gu_)); \ -switch(sizeof(*hptr)) {\ -case 1: break; \ -case 2: gu_ = tswap16(gu_); break; \ -case 4: gu_ = tswap32(gu_); break; \ -case 8: gu_ = tswap64(gu_); break; \ -default: abort();\ -}\ -(x) = gu_; \ -0;\ -}) +/* NOTE __get_user and __put_user use host pointers and don't check access. + These are usually used to access struct data members once the struct has + been locked - usually with lock_user_struct. */ + +/* Tricky points: + - Use __builtin_choose_expr to avoid type promotion from ?:, + - Invalid sizes result in a compile time error stemming from + the fact that abort has no parameters. + - It's easier to use the endian-specific unaligned load/store + functions than host-endian unaligned load/store plus tswapN. */ + +#define __put_user_e(x, hptr, e)\ + (__builtin_choose_expr(sizeof(*(hptr)) == 1, stb_p, \ + __builtin_choose_expr(sizeof(*(hptr)) == 2, stw_##e##_p, \ + __builtin_choose_expr(sizeof(*(hptr)) == 4, stl_##e##_p, \ + __builtin_choose_expr(sizeof(*(hptr)) == 8, stq_##e##_p, abort \ + ((hptr), (x)), 0) + +#define __get_user_e(x, hptr, e)\ + ((x) =\ + __builtin_choose_expr(sizeof(*(hptr)) == 1, ldub_p, \ + __builtin_choose_expr(sizeof(*(hptr)) == 2, lduw_##e##_p,\ + __builtin_choose_expr(sizeof(*(hptr)) == 4, ldl_##e##_p, \ + __builtin_choose_expr(sizeof(*(hptr)) == 8, ldq_##e##_p, abort \ + (hptr), 0) For 8- and 16-bit quantities the load is explicitly unsigned through the use of ldub and lduw. But for 32-bit, ldl_[bl]e_p return an int, so if x is a 64-bit variable sign-extension will happen. I'm not sure this is desirable, for instance when using get_user_u32 which makes one think the result is an unsigned 32-bit value. Shouldn't ldul*_p functions be added and used in __get_user_e? Note I found this in private code, but wonder if some public code isn't affected by this. Thanks, Laurent + +#ifdef TARGET_WORDS_BIGENDIAN +# define __put_user(x, hptr) __put_user_e(x, hptr, be) +# define __get_user(x, hptr) __get_user_e(x, hptr, be) +#else +# define __put_user(x, hptr) __put_user_e(x, hptr, le) +# define __get_user(x, hptr) __get_user_e(x, hptr, le) +#endif /* put_user()/get_user() take a guest address and check access */ /* These are usually used to access an atomic data type, such as an int, -- 1.7.11.7
Re: [Qemu-devel] func call to safely shutdown VM and quit qemu?
On Mon, Dec 3, 2012 at 9:43 AM, Peter Cheung mcheun...@hotmail.com wrote: Hi Is there a func call to safely shutdown VM and quit qemu? I am using qemu_system_shutdown_request(). I don't know it that's the best way of quitting, but it works for me. HTH, Laurent
Re: [Qemu-devel] [PATCH v2 1/3] tcg/arm: fix TLB access in qemu-ld/st ops
On Tue, Oct 30, 2012 at 1:18 AM, y...@ohm.aurel32.net wrote: From: Aurelien Jarno aurel...@aurel32.net The TCG arm backend considers likely that the offset to the TLB entries does not exceed 12 bits for mem_index = 0. In practice this is not true for at least the MIPS target. The current patch fixes that by loading the bits 23-12 with a separate instruction, and using loads with address writeback, independently of the value of mem_idx. In total this allow a 24-bit offset, which is a lot more than needed. Cc: Andrzej Zaborowski balr...@gmail.com Cc: Peter Maydell peter.mayd...@linaro.org Cc: qemu-sta...@nongnu.org Signed-off-by: Aurelien Jarno aurel...@aurel32.net --- tcg/arm/tcg-target.c | 77 +++--- 1 file changed, 41 insertions(+), 36 deletions(-) diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c index e790bf0..03b1576 100644 --- a/tcg/arm/tcg-target.c +++ b/tcg/arm/tcg-target.c @@ -639,6 +639,22 @@ static inline void tcg_out_ld32_12(TCGContext *s, int cond, (rn 16) | (rd 12) | ((-im) 0xfff)); } +/* Offset pre-increment with base writeback. */ +static inline void tcg_out_ld32_12wb(TCGContext *s, int cond, + int rd, int rn, tcg_target_long im) +{ +/* ldr with writeback and both register equals is UNPREDICTABLE */ +assert(rd != rn); + +if (im = 0) { +tcg_out32(s, (cond 28) | 0x05b0 | +(rn 16) | (rd 12) | (im 0xfff)); +} else { +tcg_out32(s, (cond 28) | 0x0530 | +(rn 16) | (rd 12) | ((-im) 0xfff)); +} +} + static inline void tcg_out_st32_12(TCGContext *s, int cond, int rd, int rn, tcg_target_long im) { @@ -1071,7 +1087,7 @@ static inline void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc) { int addr_reg, data_reg, data_reg2, bswap; #ifdef CONFIG_SOFTMMU -int mem_index, s_bits; +int mem_index, s_bits, tlb_offset; TCGReg argreg; # if TARGET_LONG_BITS == 64 int addr_reg2; @@ -,19 +1127,15 @@ static inline void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc) TCG_REG_R0, TCG_REG_R8, CPU_TLB_SIZE - 1); tcg_out_dat_reg(s, COND_AL, ARITH_ADD, TCG_REG_R0, TCG_AREG0, TCG_REG_R0, SHIFT_IMM_LSL(CPU_TLB_ENTRY_BITS)); -/* In the - * ldr r1 [r0, #(offsetof(CPUArchState, tlb_table[mem_index][0].addr_read))] - * below, the offset is likely to exceed 12 bits if mem_index != 0 and - * not exceed otherwise, so use an - * add r0, r0, #(mem_index * sizeof *CPUArchState.tlb_table) - * before. - */ -if (mem_index) +/* We assume that the offset is contained within 24 bits. */ +tlb_offset = offsetof(CPUArchState, tlb_table[mem_index][0].addr_read); +assert(tlb_offset ~0xff == 0); +if (tlb_offset 0xfff) { tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_R0, TCG_REG_R0, -(mem_index (TLB_SHIFT 1)) | -((16 - (TLB_SHIFT 1)) 8)); -tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R0, -offsetof(CPUArchState, tlb_table[0][0].addr_read)); +0xa00 | (tlb_offset 12)); Isn't it 20 bits rather than 24 bits since the immediate is 8-bit right-rotated by 20? Laurent +tlb_offset = 0xfff; +} +tcg_out_ld32_12wb(s, COND_AL, TCG_REG_R1, TCG_REG_R0, tlb_offset); tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R1, TCG_REG_R8, SHIFT_IMM_LSL(TARGET_PAGE_BITS)); /* Check alignment. */ @@ -1131,15 +1143,14 @@ static inline void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc) tcg_out_dat_imm(s, COND_EQ, ARITH_TST, 0, addr_reg, (1 s_bits) - 1); # if TARGET_LONG_BITS == 64 -/* XXX: possibly we could use a block data load or writeback in - * the first access. */ -tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R0, -offsetof(CPUArchState, tlb_table[0][0].addr_read) + 4); +/* XXX: possibly we could use a block data load in the first access. */ +tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R0, 4); tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0, TCG_REG_R1, addr_reg2, SHIFT_IMM_LSL(0)); # endif tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R0, -offsetof(CPUArchState, tlb_table[0][0].addend)); +offsetof(CPUTLBEntry, addend) +- offsetof(CPUTLBEntry, addr_read)); switch (opc) { case 0: @@ -1288,7 +1299,7 @@ static inline void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc) { int addr_reg, data_reg, data_reg2, bswap; #ifdef CONFIG_SOFTMMU -int mem_index, s_bits; +int mem_index, s_bits,
Re: [Qemu-devel] Singlestepping Target assembly instructions
On Tue, Oct 16, 2012 at 9:07 PM, Richard Henderson r...@twiddle.net wrote: On 2012-10-16 20:49, Emmanuel Blot wrote: Is there any way to force QEmu to disable the TB cache so that the translation occurs each time a target instruction is loaded, or a clever way to print out the address of each executed target instruction ? -d exec prints the entry point of each TB as it is executed. You can refer to the previously disassembled insns by reference. If I remember correctly, that only works if block chaining is disabled. Laurent
Re: [Qemu-devel] [PATCH 1/5] tcg/arm: fix TLB access in qemu-ld/st ops
On Tue, Oct 9, 2012 at 11:13 PM, Peter Maydell peter.mayd...@linaro.org wrote: On 9 October 2012 21:30, Aurelien Jarno aurel...@aurel32.net wrote: The TCG arm backend considers likely that the offset to the TLB entries does not exceed 12 bits for mem_index = 0. In practice this is not true for at list the MIPS target. The current patch fixes that by loading the bits 23-12 with a separate instruction, and using loads with address writeback, independently of the value of mem_idx. In total this allow a 24-bit offset, which is a lot more than needed. Would probably be good to assert() that the bits above 24 are zero, though. Cc: Andrzej Zaborowski balr...@gmail.com Cc: Peter Maydell peter.mayd...@linaro.org Cc: qemu-sta...@nongnu.org Signed-off-by: Aurelien Jarno aurel...@aurel32.net --- tcg/arm/tcg-target.c | 73 +- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c index 737200e..6cde512 100644 --- a/tcg/arm/tcg-target.c +++ b/tcg/arm/tcg-target.c @@ -624,6 +624,19 @@ static inline void tcg_out_ld32_12(TCGContext *s, int cond, (rn 16) | (rd 12) | ((-im) 0xfff)); } +/* Offset pre-increment with base writeback. */ +static inline void tcg_out_ld32_12wb(TCGContext *s, int cond, + int rd, int rn, tcg_target_long im) +{ Worth asserting that rd != rn ? (that's an UNPREDICTABLE case for ldr with writeback) +if (im = 0) { +tcg_out32(s, (cond 28) | 0x05b0 | + (rn 16) | (rd 12) | (im 0xfff)); +} else { +tcg_out32(s, (cond 28) | 0x0530 | + (rn 16) | (rd 12) | ((-im) 0xfff)); +} you could avoid the if() by just setting the U bit using (im = 0) 23 Clearer? Dunno. You also have to negate the im value so it's not enough to just change the opcode. Laurent Looks OK otherwise (though I haven't tested it.) Random aside: we emit a pretty long string of COND_EQ predicated code in these functions, especially in the 64 bit address and byteswapped cases. It might be interesting to see if performance on an A9, say, was any better if we just did a conditional branch over it instead... Probably borderline to no-effect, though. -- PMM
Re: [Qemu-devel] [PATCH 5/5] tcg/arm: improve direct jump
On Tue, Oct 9, 2012 at 10:30 PM, Aurelien Jarno aurel...@aurel32.net wrote: Use ldr pc, [pc, #-4] kind of branch for direct jump. This removes the need to flush the icache on TB linking, and allow to remove the limit on the code generation buffer. I'm not sure I like it. In general having data in the middle of code will increase I/D cache and I/D TLB pressure. This improves the boot-up speed of a MIPS guest by 11%. Boot speed is very specific. Did you test some other code? Also what was your host? Testing on a quad core Cortex-A9, using all of your patches (including TCG optimizations), I get this running nbench i386 in user mode: TEST: Iter/sec. : Old Index : New Index :: Pentium 90 : AMD K6/233 :::--- NUMERIC SORT: 119.48 : 3.06 : 1.01 STRING SORT : 7.7907 : 3.48 : 0.54 BITFIELD: 2.2049e+07 : 3.78 : 0.79 FP EMULATION: 5.094 : 2.44 : 0.56 FOURIER : 483.73 : 0.55 : 0.31 ASSIGNMENT : 1.778 : 6.77 : 1.75 IDEA: 341.43 : 5.22 : 1.55 HUFFMAN : 45.942 : 1.27 : 0.41 NEURAL NET :0.16667 : 0.27 : 0.11 LU DECOMPOSITION: 5.969 : 0.31 : 0.22 ===ORIGINAL BYTEMARK RESULTS== INTEGER INDEX : 3.319 FLOATING-POINT INDEX: 0.357 ===LINUX DATA BELOW=== MEMORY INDEX: 0.907 INTEGER INDEX : 0.774 FLOATING-POINT INDEX: 0.198 Not using this patch, I get: TEST: Iter/sec. : Old Index : New Index :: Pentium 90 : AMD K6/233 :::--- NUMERIC SORT: 121.88 : 3.13 : 1.03 STRING SORT : 7.8438 : 3.50 : 0.54 BITFIELD: 2.2597e+07 : 3.88 : 0.81 FP EMULATION: 5.1424 : 2.47 : 0.57 FOURIER : 466.04 : 0.53 : 0.30 ASSIGNMENT : 1.809 : 6.88 : 1.79 IDEA: 359.28 : 5.50 : 1.63 HUFFMAN : 46.225 : 1.28 : 0.41 NEURAL NET :0.16644 : 0.27 : 0.11 LU DECOMPOSITION: 5.77 : 0.30 : 0.22 ===ORIGINAL BYTEMARK RESULTS== INTEGER INDEX : 3.384 FLOATING-POINT INDEX: 0.349 ===LINUX DATA BELOW=== MEMORY INDEX: 0.922 INTEGER INDEX : 0.790 FLOATING-POINT INDEX: 0.193 This patch doesn't bring any speedup in that case. I guess we need more testing as a synthetic benchmark is as specific as kernel booting :-) Laurent Cc: Andrzej Zaborowski balr...@gmail.com Cc: Peter Maydell peter.mayd...@linaro.org Signed-off-by: Aurelien Jarno aurel...@aurel32.net --- exec-all.h | 24 exec.c |4 tcg/arm/tcg-target.c |7 +-- 3 files changed, 5 insertions(+), 30 deletions(-) diff --git a/exec-all.h b/exec-all.h index 6516da0..662b916 100644 --- a/exec-all.h +++ b/exec-all.h @@ -224,26 +224,10 @@ static inline void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr) #elif defined(__arm__) static inline void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr) { -#if !QEMU_GNUC_PREREQ(4, 1) -register unsigned long _beg __asm (a1); -register unsigned long _end __asm (a2); -register unsigned long _flg __asm (a3); -#endif - -/* we could use a ldr pc, [pc, #-4] kind of branch and avoid the flush */ -*(uint32_t *)jmp_addr = -(*(uint32_t *)jmp_addr ~0xff) -| (((addr - (jmp_addr + 8)) 2) 0xff); - -#if QEMU_GNUC_PREREQ(4, 1) -__builtin___clear_cache((char *) jmp_addr, (char *) jmp_addr + 4); -#else -/* flush icache */ -_beg = jmp_addr; -_end = jmp_addr + 4; -_flg = 0; -__asm __volatile__ (swi 0x9f0002 : : r (_beg), r (_end), r (_flg)); -#endif +/* Patch the branch destination. It uses a ldr pc, [pc, #-4] kind + of branch so we write absolute address and we don't need to + flush icache. */ +*(uint32_t *)jmp_addr = addr; } #elif defined(__sparc__) void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr); diff --git a/exec.c b/exec.c index 7899042..8d115ac 100644 --- a/exec.c +++ b/exec.c @@ -546,10 +546,6 @@ static void code_gen_alloc(unsigned long tb_size) start = (void *) 0x4000UL; if (code_gen_buffer_size (512 * 1024 * 1024)) code_gen_buffer_size = (512 * 1024 * 1024); -#elif defined(__arm__) -/* Keep the buffer no bigger than 16MB to branch between blocks */ -if (code_gen_buffer_size 16 * 1024 * 1024) -
Re: [Qemu-devel] [PATCH 5/5] tcg/arm: improve direct jump
On Wed, Oct 10, 2012 at 4:28 PM, Aurelien Jarno aurel...@aurel32.net wrote: On Wed, Oct 10, 2012 at 03:21:48PM +0200, Laurent Desnogues wrote: On Tue, Oct 9, 2012 at 10:30 PM, Aurelien Jarno aurel...@aurel32.net wrote: Use ldr pc, [pc, #-4] kind of branch for direct jump. This removes the need to flush the icache on TB linking, and allow to remove the limit on the code generation buffer. I'm not sure I like it. In general having data in the middle of code will increase I/D cache and I/D TLB pressure. Agreed. On the other hand, this patch remove the synchronization of the instruction cache for TB linking/unlinking. TB linking/unlinking should happen less often than code execution. This improves the boot-up speed of a MIPS guest by 11%. Boot speed is very specific. Did you test some other code? Also what was your host? I tested it on a Cortex-A8 machine. I have only tested MIPS, but I can do more tests, like running the openssl testsuite in the emulated guest. Yes, please. [...] This doesn't really surprise me. The goal of the patch is to remove the limit of 16MB for the generated code. I really doubt you reach such a limit in user mode unless you use some complex code. On the other hand in system mode, this can be already reached once the whole guest kernel is translated, so cached code is dropped and has to be re-translated regularly. Re-translating guest code is clearly more expensive than the increase of I/D cache and I/D TLB pressure. Ha yes, that's a real problem. What about having some define and/or runtime flag to keep both caches sync and your ldr PC change in QEMU? The other way to allow more than 16MB of generated code would be to disable direct jump on ARM. It adds one 32-bit constant loading + one memory load, but then you don't have the I/D cache and TLB issue. The performance hit would be even worse :-) Laurent
Re: [Qemu-devel] [PATCH 0/2] make guest-base support mandatory for TCG backends
On Tue, Oct 9, 2012 at 3:28 PM, Richard Henderson r...@twiddle.net wrote: On 10/09/2012 06:16 AM, Peter Maydell wrote: That is, we could drop CONFIG_USER_GUEST_BASE. Does anybody have a practical use case for the --disable-guest-base configuration? Nope, because the backends are good at eliminating the overhead during tcg_target_qemu_prologue if guest_base turns out to be zero. I had some difficulties getting the same code generated with --enable-guest-base as the one generated with --disable-guest-base, for ARM on x86_64. One has to play with -B 0 and -R, and that was not obvious, while --disable-guest-base was obvious for me :-). Guest ARM, host x86_64. $ ./arm-linux-user/qemu-arm -singlestep -d in_asm,out_asm /work/qemu/Tests/arm/gcc-a9-O3 x IN: 0x8138: e59fc010 ldr ip, [pc, #16] ; 0x8150 OUT: [size=53] 0x602296c0: mov$0x8150,%ebp 0x602296c5: mov$0x2ac1f9b61000,%rdi 0x602296cf: add%rbp,%rdi 0x602296d2: mov(%rdi),%ebp 0x602296d4: mov%ebp,0x30(%r14) 0x602296d8: jmpq 0x602296dd 0x602296dd: mov$0x813c,%ebp 0x602296e2: mov%ebp,0x3c(%r14) 0x602296e6: mov$0x2ac1f6760281,%rax 0x602296f0: jmpq 0x6226d6b6 Running with -B 0 and playing with -R values until the program succeeds in loading finally produces the same code as when compiling QEMU with --disable-guest-base OUT: [size=41] 0x602281a0: mov$0x8150,%ebp 0x602281a5: mov0x0(%rbp),%ebp 0x602281a8: mov%ebp,0x30(%r14) 0x602281ac: jmpq 0x602281b1 0x602281b1: mov$0x813c,%ebp 0x602281b6: mov%ebp,0x3c(%r14) 0x602281ba: mov$0x2b9b26096281,%rax 0x602281c4: jmpq 0x6226c1b6 I know the reason why reserved_va was set to the current value (cf 288e65b9eea0c9b3cbe21be46f3e24e4e8b2a090), but I thought it was interesting to point out the above. Laurent
Re: [Qemu-devel] [PATCH] configure: Allow builds without any system or user emulation
On Sat, Sep 22, 2012 at 11:37 PM, Stefan Weil s...@weilnetz.de wrote: Am 14.09.2012 19:02, schrieb Stefan Weil: The old code aborted configure when no emulation target was selected. Even after removing the 'exit 1', it tried to read from STDIN when QEMU was configured with configure' '--disable-user' '--disable-system' This is fixed here. Signed-off-by: Stefan Weils...@weilnetz.de --- This patch can be applied after 66d5499b3 was reverted. It also works on top of 66d5499b3. In this case only Makefile needs modifications, and the configure part of the patch must be removed. Regards Stefan Weil Makefile |5 + configure |4 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 9523e05..d38ac0f 100644 --- a/Makefile +++ b/Makefile @@ -52,8 +52,13 @@ SUBDIR_MAKEFLAGS=$(if $(V),,--no-print-directory) BUILD_DIR=$(BUILD_DIR) SUBDIR_DEVICES_MAK=$(patsubst %, %/config-devices.mak, $(TARGET_DIRS)) SUBDIR_DEVICES_MAK_DEP=$(patsubst %, %/config-devices.mak.d, $(TARGET_DIRS)) +ifeq ($(SUBDIR_DEVICES_MAK),) +config-all-devices.mak: + $(call quiet-command,echo '# no devices' $@, GEN $@) +else config-all-devices.mak: $(SUBDIR_DEVICES_MAK) $(call quiet-command,cat $(SUBDIR_DEVICES_MAK) | grep =y | sort -u $@, GEN $@) +endif -include $(SUBDIR_DEVICES_MAK_DEP) diff --git a/configure b/configure index fc27bd9..a9305f3 100755 --- a/configure +++ b/configure @@ -1331,10 +1331,6 @@ if test -z $target_list ; then else target_list=`echo $target_list | sed -e 's/,/ /g'` fi -if test -z $target_list ; then -echo No targets enabled -exit 1 -fi # see if system emulation was really requested case $target_list in *-softmmu *) softmmu=yes Ping? 66d5499b3 was reverted, so my patch can be applied if nobody objects. Works fine here. I think Daniel's original proposal to use some keyword for target-list is more convenient than having to use both --disable-user and --disable-system, but that's no big deal :-) Laurent
Re: [Qemu-devel] [PATCH 0/9] tcg/optimize: rework copy propagation
On Wed, Sep 19, 2012 at 10:00 PM, Aurelien Jarno aurel...@aurel32.net wrote: This patch series rework the copy propagation in order to generate better code. The first two patches are cleanup and bug fixes, the third patch is the heart of the series, and the remaining ones are small optimizations using the new copy propagation. I have measured a decrease of the generated code size of about 4%, with a gain in speed between 0 and 2% depending on the workload. For better benefits in ARM emulation, it should be used with the setcond patches series I have sent a few days ago. Using this patch series along with the ARM one, I got a nice gain of speed in user mode for SPECCPU 2k gcc: without the patches the integrate.i data set runs in 13.6s, with them the time is reduced to 12.4s (host CPU is i5 2400 64-bit mode). Nice job! Laurent Aurelien Jarno (9): tcg/optimizer: remove TCG_TEMP_ANY tcg/optimizer: check types in copy propagation tcg/optimizer: rework copy progagation tcg/optimize: do copy propagation for all operations tcg/optimize: optimize op r, a, a = mov r, a tcg/optimize: optimize op r, a, a = movi r, 0 tcg/optimize: further optimize brcond/setcond tcg/optimize: prefer the op a, a, b form for commutative ops tcg: remove #ifdef #endif around TCGOpcode tests tcg/optimize.c | 326 ++-- tcg/tcg.c | 16 +-- 2 files changed, 200 insertions(+), 142 deletions(-) -- 1.7.10.4
Re: [Qemu-devel] [PATCH 3/5] target-arm: convert shl and shr helpers to TCG
On Mon, Sep 17, 2012 at 1:08 AM, Aurelien Jarno aurel...@aurel32.net wrote: Now that the setcond TCG op is available, it's possible to replace shl and shr helpers by TCG code. The code generated by TCG is slightly longer than the code generated by GCC for the helper but is still worth it as this avoid all the consequences of using an helper: globals saved back to memory, no possible optimization, call overhead, etc. Cc: Peter Maydell peter.mayd...@linaro.org Signed-off-by: Aurelien Jarno aurel...@aurel32.net --- target-arm/helper.h|2 -- target-arm/op_helper.c | 16 target-arm/translate.c | 32 3 files changed, 28 insertions(+), 22 deletions(-) diff --git a/target-arm/helper.h b/target-arm/helper.h index 7151e28..b123d3e 100644 --- a/target-arm/helper.h +++ b/target-arm/helper.h @@ -145,8 +145,6 @@ DEF_HELPER_5(neon_tbl, i32, env, i32, i32, i32, i32) DEF_HELPER_3(adc_cc, i32, env, i32, i32) DEF_HELPER_3(sbc_cc, i32, env, i32, i32) -DEF_HELPER_3(shl, i32, env, i32, i32) -DEF_HELPER_3(shr, i32, env, i32, i32) DEF_HELPER_3(sar, i32, env, i32, i32) DEF_HELPER_3(shl_cc, i32, env, i32, i32) DEF_HELPER_3(shr_cc, i32, env, i32, i32) diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c index 6095f24..5fcd12c 100644 --- a/target-arm/op_helper.c +++ b/target-arm/op_helper.c @@ -355,22 +355,6 @@ uint32_t HELPER(sbc_cc)(CPUARMState *env, uint32_t a, uint32_t b) /* Similarly for variable shift instructions. */ -uint32_t HELPER(shl)(CPUARMState *env, uint32_t x, uint32_t i) -{ -int shift = i 0xff; -if (shift = 32) -return 0; -return x shift; -} - -uint32_t HELPER(shr)(CPUARMState *env, uint32_t x, uint32_t i) -{ -int shift = i 0xff; -if (shift = 32) -return 0; -return (uint32_t)x shift; -} - uint32_t HELPER(sar)(CPUARMState *env, uint32_t x, uint32_t i) { int shift = i 0xff; diff --git a/target-arm/translate.c b/target-arm/translate.c index 19bb1e8..9c29065 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -440,6 +440,26 @@ static void gen_sub_CC(TCGv dest, TCGv t0, TCGv t1) tcg_gen_mov_i32(dest, cpu_NF); } +#define GEN_SHIFT(name)\ +static void gen_##name(TCGv dest, TCGv t0, TCGv t1)\ +{ \ +TCGv tmp1, tmp2; \ +tmp1 = tcg_temp_new_i32(); \ +tcg_gen_andi_i32(tmp1, t1, 0xff); \ +tmp2 = tcg_temp_new_i32(); \ +tcg_gen_setcondi_i32(TCG_COND_GE, tmp2, tmp1, 32); \ +tcg_gen_andi_i32(tmp1, tmp1, 0x1f);\ I don't think the 'and 0x1f' is needed given that later you'll and with 0 if the shift amount is = 32. +tcg_gen_##name##_i32(dest, t0, tmp1); \ +tcg_temp_free_i32(tmp1); \ +tcg_gen_subi_i32(tmp2, tmp2, 1); \ +tcg_gen_and_i32(dest, dest, tmp2); \ +tcg_temp_free_i32(tmp2); \ +} Laurent +GEN_SHIFT(shl) +GEN_SHIFT(shr) +#undef GEN_SHIFT + + /* FIXME: Implement this natively. */ #define tcg_gen_abs_i32(t0, t1) gen_helper_abs(t0, t1) @@ -516,8 +536,12 @@ static inline void gen_arm_shift_reg(TCGv var, int shiftop, } } else { switch (shiftop) { -case 0: gen_helper_shl(var, cpu_env, var, shift); break; -case 1: gen_helper_shr(var, cpu_env, var, shift); break; +case 0: +gen_shl(var, var, shift); +break; +case 1: +gen_shr(var, var, shift); +break; case 2: gen_helper_sar(var, cpu_env, var, shift); break; case 3: tcg_gen_andi_i32(shift, shift, 0x1f); tcg_gen_rotr_i32(var, var, shift); break; @@ -9161,7 +9185,7 @@ static void disas_thumb_insn(CPUARMState *env, DisasContext *s) break; case 0x2: /* lsl */ if (s-condexec_mask) { -gen_helper_shl(tmp2, cpu_env, tmp2, tmp); +gen_shl(tmp2, tmp2, tmp); } else { gen_helper_shl_cc(tmp2, cpu_env, tmp2, tmp); gen_logic_CC(tmp2); @@ -9169,7 +9193,7 @@ static void disas_thumb_insn(CPUARMState *env, DisasContext *s) break; case 0x3: /* lsr */ if (s-condexec_mask) { -gen_helper_shr(tmp2, cpu_env, tmp2, tmp); +gen_shr(tmp2, tmp2, tmp); } else { gen_helper_shr_cc(tmp2, cpu_env, tmp2, tmp); gen_logic_CC(tmp2); -- 1.7.10.4
Re: [Qemu-devel] [PATCH 3/5] target-arm: convert shl and shr helpers to TCG
On Mon, Sep 17, 2012 at 11:43 AM, Peter Maydell peter.mayd...@linaro.org wrote: On 17 September 2012 10:30, Laurent Desnogues laurent.desnog...@gmail.com wrote: On Mon, Sep 17, 2012 at 1:08 AM, Aurelien Jarno aurel...@aurel32.net wrote: +#define GEN_SHIFT(name)\ +static void gen_##name(TCGv dest, TCGv t0, TCGv t1)\ +{ \ +TCGv tmp1, tmp2; \ +tmp1 = tcg_temp_new_i32(); \ +tcg_gen_andi_i32(tmp1, t1, 0xff); \ +tmp2 = tcg_temp_new_i32(); \ +tcg_gen_setcondi_i32(TCG_COND_GE, tmp2, tmp1, 32); \ +tcg_gen_andi_i32(tmp1, tmp1, 0x1f);\ I don't think the 'and 0x1f' is needed given that later you'll and with 0 if the shift amount is = 32. The TCG shift operations are undefined behaviour (not merely undefined result) if the shift is = 32, so we must avoid doing that even if we're going to throw away the answer. That's odd that it doesn't just state that the result is undefined. I wonder what undefined behavior means. I understand what undefined behavior (as opposed tu undefined result) means for divisions by 0, but not for a shift larger than data type width. Anyway that makes my comment about removing the 0x1f pointless. Laurent
Re: [Qemu-devel] [PATCH v2] Add ability to disable build of all targets
On Tue, Sep 11, 2012 at 8:56 PM, Eduardo Habkost ehabk...@redhat.com wrote: On Mon, Sep 10, 2012 at 06:00:54PM -0500, Anthony Liguori wrote: Daniel P. Berrange berra...@redhat.com writes: From: Daniel P. Berrange berra...@redhat.com Allow passing of '--target-list=' to configure to request that all targets are to be disabled. This allows for doing a very fast tools-only build of things like qemu-img, qemu-io, qemu-nbd. Signed-off-by: Daniel P. Berrange berra...@redhat.com Applied. Thanks. This patch broke the --target-list option: $ ./configure --target-list=x86_64-linux-user,x86_64-softmmu [...] ERROR: Target 'x86_64-linux-user,x86_64-softmmu' not recognised $ It also seems to not even allow to use --target-list=. On my setup (CentOS 5.6, building in the git directory) the build process just hangs here: $ make V=1 cat | grep =y | sort -u config-all-devices.mak Laurent Regards, Anthony Liguori --- configure | 13 - 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/configure b/configure index 75dc9da..472374e 100755 --- a/configure +++ b/configure @@ -127,7 +127,7 @@ cc_i386=i386-pc-linux-gnu-gcc libs_qga= debug_info=yes -target_list= +target_list=DEFAULT # Default value for a variable defining feature foo. # * foo=no feature will only be used if --enable-foo arg is given @@ -1319,15 +1319,10 @@ if ! $python -c 'import sys; sys.exit(sys.version_info (2,4) or sys.version_ exit 1 fi -if test -z $target_list ; then -target_list=$default_target_list -else -target_list=`echo $target_list | sed -e 's/,/ /g'` -fi -if test -z $target_list ; then -echo No targets enabled -exit 1 +if test $target_list = DEFAULT ; then +target_list=`echo $default_target_list | sed -e 's/,/ /g'` fi + # see if system emulation was really requested case $target_list in *-softmmu *) softmmu=yes -- 1.7.11.2 -- Eduardo
Re: [Qemu-devel] [PATCH] configure: fix --target-list=target, target, ... option
Sorry, I had missed this patch... On Tue, Sep 11, 2012 at 9:02 PM, Eduardo Habkost ehabk...@redhat.com wrote: commit 66d5499b3754b83c09487259c08fe2ce73188a59 broke the support for comma-separated target lists on the --target-list option. e.g.: $ ./configure --target-list=x86_64-linux-user,x86_64-softmmu [...] ERROR: Target 'x86_64-linux-user,x86_64-softmmu' not recognised $ This patch restores that ability. Signed-off-by: Eduardo Habkost ehabk...@redhat.com Cc: Daniel P. Berrange berra...@redhat.com Cc: Anthony Liguori anth...@codemonkey.ws --- configure | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/configure b/configure index 7656c32..9ee7038 100755 --- a/configure +++ b/configure @@ -1323,7 +1323,9 @@ if ! $python -c 'import sys; sys.exit(sys.version_info (2,4) or sys.version_ fi if test $target_list = DEFAULT ; then -target_list=`echo $default_target_list | sed -e 's/,/ /g'` +target_list=$default_target_list +else +target_list=`echo $target_list | sed -e 's/,/ /g'` fi This works for me too. But I still can't get what the original patch posted by Daniel Berrange intended to do: $ ./configure --target-list= $ make V=1 cat | grep =y | sort -u config-all-devices.mak And it of course hangs there. Creating an empty config-all-devices.mak before running make solves the issue. Laurent # see if system emulation was really requested -- 1.7.11.4
Re: [Qemu-devel] How does ARM VFP is emulated?
On Thursday, August 16, 2012, Oi Khote oikh...@hotmail.com wrote: So how exactly does VFP is being emulated. QEMU uses a library for FP computations, based on the softfloat package. Laurent
Re: [Qemu-devel] qemu log function to print out the registers of the guest
On Thu, Aug 16, 2012 at 7:49 PM, Steven wangwangk...@gmail.com wrote: [...] I want to get the guest memory address in the instruction mov 0x4(%ebx) %eax, whic is 0x4(%ebx). Since %ebx is not resolved until the execution time, the code in softmmu_header.h does not generate any hit or miss information. Do you know any place that I could resolve the memory access address? Thanks. You'll have to generate code. Look at how helpers work. Laurent
Re: [Qemu-devel] Is the return address of get_page_addr_code guest physical address?
On Sat, Aug 11, 2012 at 5:41 AM, Steven wangwangk...@gmail.com wrote: [...] I want to get the guest physical address of a pc. I note the part of the function cpu_x86_handle_mmu_fault will do something like page walking to convert a pc to its guest physical address. I think this is the guest physical address I need. However, there is no other function available to do this page walking. So I am thinking add a function to do the conversion. If you want the translation of any guest virtual address to guest physical address then cpu_x86_handle_mmu_fault is close to what you want. Of course you'd have to rewrite it as you probably don't want your function to change the CPU env. Perhaps cpu_get_phys_page_debug is even closer to what you need, but I'm not familiar enough with x86 to say for sure. Laurent
Re: [Qemu-devel] Smc support in qemu
On Thu, Aug 2, 2012 at 2:20 PM, Itaru Kitayama kitay...@cl.bb4u.ne.jp wrote: The recent upstream highbank kernel uses smc to enable its L2 cache, but on a qemu virt machine it is not supported yet. Is it likely supported by qemu soon? What is the time frame for that? For other people reading this, SMC is the instruction used by ARM processors to call routines written in the Secure world. The issue here is that even if you had support for that instruction, it wouldn't be enough because each platform has different SMC implementations, which code of course is not public, and quite often even the API is undocumented. Riku Voipio has a branch which implements some basic security stuff here: http://git.linaro.org/gitweb?p=people/rikuvoipio/qemu.git;a=shortlog;h=refs/heads/linaro (That branch is old, but it can help to understand the basic stuff needed.) In helper.c: case EXCP_SMC: if (semihosting_enabled) { cpu_abort(env, SMC handling under semihosting not implemented\n); return; } if ((env-uncached_cpsr CPSR_M) == ARM_CPU_MODE_SMC) { env-cp15.c1_secfg = ~1; } offset = env-thumb ? 2 : 0; new_mode = ARM_CPU_MODE_SMC; addr = 0x08; mask = CPSR_A | CPSR_I | CPSR_F; break; If you know the SMC API, you could perhaps plug some code to run when semihosting_enabled is set (though that might conflict with the other uses of that flag, so one could perhaps use some other flag that would say if we simulate SMC code for OMAP, or Calxeda, or Tegra 3, etc.). If support can not be expected any time soon, how do we work around the issue? I have been using qemu-linaro-1.1.50-2012.07. I'm afraid the easiest way is to patch the kernel. Laurent
Re: [Qemu-devel] Smc support in qemu
On Fri, Aug 3, 2012 at 10:37 AM, Peter Maydell peter.mayd...@linaro.org wrote: [...] http://git.linaro.org/gitweb?p=people/rikuvoipio/qemu.git;a=shortlog;h=refs/heads/linaro That's just a personal (and as you say old) copy of qemu-linaro. qemu-linaro proper is here: http://git.linaro.org/gitweb?p=qemu/qemu-linaro.git;a=shortlog;h=refs/heads/rebasing I should have known better, thanks :-) My proposal for how we should handle SMC is here: http://lists.gnu.org/archive/html/qemu-devel/2012-05/msg03012.html (the code in qemu-linaro for SMC is similar to that but not exactly the same semantics). If I correctly understood your proposal, it would require some ARM code to simulate the behaviour of the secure ROM, right? IMHO that'd be much more complex than what is needed to solve Itaru Kitayama issue. In fact, I wonder if considering all SMC calls as NOPs wouldn't be enough to run non secure kernels using that feature (of course, one would still need some of the secure cp15 registers, but that's not a very difficult task I guess). Laurent
Re: [Qemu-devel] Adding a parameter to a helper
On Mon, Jul 30, 2012 at 6:40 PM, Jose Cano Reyes jc...@ac.upc.edu wrote: I am trying to add a new integer parameter to an existing helper and call this helper in targeti386/translate.c. I have several problems: 1) I cannot add an integer parameter to the helper, the compiler says that it must be TCGv_i32, despite I declare this new parameter as int in target-i386/helper.h. Why? Helpers only accept TCGv parameters. 2) If I use the the function tcg_const_i32 in order to convert my integer to TCGv_i32 I always obtain the same output value, that is: tcg_const_i32(10) = 1074260520 tcg_const_i32(22) = 1074260520 tcg_const_i32(30) = 1074260520 ... TCGv is an index, not the value it represents. Think of it as an id. tcg_const will allocate a TCGv and then emit a TCG mov instruction to assign it a value. 3) Moreover, wen I pass this value in the helper call gen_helper_flds_ST0, that is: gen_helper_flds_ST0(cpu_tmp2_i32, tcg_const_i32(MY_INT_VALUE)); How can I use MY_INT_VALUE later in the function tcg_gen_helperN . This function is called by DEF_HELPER_FLAGS2, which corresponds to DEF_HELPER_2 (definition of my helper). Look at helper_aam, that should help :-) Laurent
Re: [Qemu-devel] Adding a parameter to a helper
On Tue, Jul 31, 2012 at 5:09 PM, Jose Cano Reyes jc...@ac.upc.edu wrote: - So, how ca I obtain the value that TCGv_i32 represents? In the generated code (that is after TCG is translated to host machine code), you'll et your value in your helper. If you mean before running the helper, then it's much more complex and would require to process the TCG code. - I don't understand well how a helper functions. For instance, cosidering this call to a helper again: gen_helper_flds_ST0(cpu_tmp2_i32, tcg_const_i32(MY_INT_VALUE)) Can I obtain the parameters cpu_tmp2_i32 and tcg_const_i32(MY_INT_VALUE) from the args[0] and args[1] described in DEF_HELPER_FLAGS_2??? cf. above. Again look at existing helpers and how they get const values. Laurent
Re: [Qemu-devel] Using QEMU for gcc testing
On Thu, Jul 19, 2012 at 5:29 PM, Michael Eager ea...@eagerm.com wrote: I'm interested in using QEMU to test gcc for a processor. This is a hard-metal target -- there is no operating system. Can anyone make suggestions on how to do this? You could look at how QEMU implements semihosting for ARM. And also how gcc and associated libraries are using semihosting for ARM. You could then use something similar for your processor. HTH, Laurent
Re: [Qemu-devel] [RFC][PATCH v2 4/4] configure: add CONFIG_QEMU_LDST_OPTIMIZATION for TCG qemu_ld/st optimization
On Thu, Jul 5, 2012 at 4:06 PM, Peter Maydell peter.mayd...@linaro.org wrote: [...] case $target_arch2 in alpha | sparc* | xtensa* | ppc*) echo CONFIG_TCG_PASS_AREG0=y $config_target_mak +# qemu_ld/st optimization is not available with CONFIG_TCG_PASS_AREG0 +target_ldst_optimization=no PASS_AREG0 is the way of the future -- you need to fix the ldst optimization to work with it. Agreed. But what's the point of speeding up on one side and losing speed on the other side? AREG0 slowdown would be acceptable if the ARM target was using less helpers, until this happens I don't think it is a good idea to push AREG0, hence I'm not sure it is a prerequisite that Yeongkyoon Lee's patch supports it. By the way, it's also a good reason to have CONFIG_QEMU_LDST_OPTIMIZATION, which you commented on in patch 3. Basically, and in my humble opinion, this is not a good enough reason to reject the patch :-) Of course the support should be added as soon as possible once the rest has been discussed. Laurent
Re: [Qemu-devel] How to measure guest memory access (qemu_ld/qemu_st) time?
On Fri, Jun 15, 2012 at 12:30 AM, Lluís Vilanova vilan...@ac.upc.edu wrote: [...] Now that I think of it, you will have problems generating code to surround each qemu_ld/st with a lightweight mechanism to get the time. In x86 it would be rdtsc, but you want to generate a host rdtsc instruction inside the code generated by QEMU's TCG, so you should also have to hack TCG (or the code generation pointers) to issue an rdtsc instruction. Even rdtsc would introduce enough noise that it wouldn't be reliable for such a micro measurement: as far as I understand it, this instruction can be reordered, so you need to flush the pipeline before issuing it. Intel has a document about that: download.intel.com/embedded/software/IA/324264.pdf The overhead of their proposed method is so high that it's likely it would take longer than the execution of the fast path itself. IMHO a mix of YeongKyoon Lee way to count ld/st and comparison between user mode and softmmu still seems to be the best approach (well unless you have access to a cycle accurate simulator :-). Laurent
Re: [Qemu-devel] How to measure guest memory access (qemu_ld/qemu_st) time?
On Wed, Jun 13, 2012 at 5:14 AM, 陳韋任 (Wei-Ren Chen) che...@iis.sinica.edu.tw wrote: Hi all, I suspect that guest memory access (qemu_ld/qemu_st) account for the major of time spent in system mode. I would like to know precisely how much (if possible). We use tools like perf [1] before, but since the logic of guest memory access aslo embedded in the host binary not only helper functions, the result cannot be relied. The current idea is adding helper functions before/after guest memory access logic. Take ARM guest on x86_64 host for example, should I add the helper functions before/after tcg_gen_qemu_{ld,st} in target-arm/translate.c or tcg_out_qemu_{ld,st} in tcg/i386/tcg-target.c? Or there is a better way to know how much time QEMU spend on handling guest memory access? I'm afraid there's no easy way to measure that: any change you make to generated code will completely change the timing given that the ld/st fast path is only a few instructions long. Another approach might be to run the program in user mode and then in system mode (provided the guest OS is very light). As a side note, it might be interesting to gather statistics about the hit rate of the QEMU TLB. Another thing to consider is speeding up the fast path; see YeongKyoon Lee RFC patch: http://www.mail-archive.com/qemu-devel@nongnu.org/msg91294.html Laurent
Re: [Qemu-devel] ARM: Virtual / Physical address translation
On Thu, May 31, 2012 at 9:07 PM, Ira Ray Jenkins irarayjenk...@gmail.com wrote: On Wed, May 30, 2012 at 10:30 AM, Laurent Desnogues laurent.desnog...@gmail.com wrote: On Wed, May 30, 2012 at 3:20 AM, Peter Maydell peter.mayd...@linaro.org wrote: On 30 May 2012 02:00, Ira Ray Jenkins irarayjenk...@gmail.com wrote: What I would like is to be able to get the physical addresses of both data and instructions. Can anyone help me work through how to get the properly translated physical addresses given the virtual address? See the function get_phys_addr() in target-arm/helper.c ... That is a private function but if you're doing a local hack you can wire it up to what you need it for. Using that function directly is not that great an idea as it has side effects on the environment. IMHO the best is to duplicate it and remove the side effects (which is what I did for my cache simulator). BTW Edgar Iglesias has implemented a cache simulator in QEMU. I just can't remember where the repository is... Laurent Would I also need to duplicate get_phys_addr_mpu/v5/v6 ? Depending on the target CPU, you'd have to duplicate one of them. The side effects you mentioned, are these in the above functions? I was wrong in my previous mail: the changes to env are done in cpu_arm_handle_mmu_fault which is the entry point to handle VA/PA translation from generated code. Since I'm really only interested in the physical address - phys_ptr - , can I ignore protection page size? What about access type and user? You don't need to return these values. But be careful about translations that fault: if you insert your helper call before the code for the emulated ld/st, you can get faults and in this case you probably don't want to log that memory access. Edgar's work was for the cris target, so I'm unsure if it is different for arm. The way it's done should be similar. Basically, since I am just doing a memory trace dump for arm target, I just want a simple translation from virtual address to physical, unobtrusively - without modify the state/env. I'm not sure how to modify get_phys_addr*() to do this. Any help would be great. I hope you have enough information now. Laurent
Re: [Qemu-devel] ARM: Virtual / Physical address translation
On Wed, May 30, 2012 at 3:20 AM, Peter Maydell peter.mayd...@linaro.org wrote: On 30 May 2012 02:00, Ira Ray Jenkins irarayjenk...@gmail.com wrote: What I would like is to be able to get the physical addresses of both data and instructions. Can anyone help me work through how to get the properly translated physical addresses given the virtual address? See the function get_phys_addr() in target-arm/helper.c ... That is a private function but if you're doing a local hack you can wire it up to what you need it for. Using that function directly is not that great an idea as it has side effects on the environment. IMHO the best is to duplicate it and remove the side effects (which is what I did for my cache simulator). BTW Edgar Iglesias has implemented a cache simulator in QEMU. I just can't remember where the repository is... Laurent
Re: [Qemu-devel] [PATCH 08/10] qapi: add strict mode to input visitor
Hello, On Thu, Mar 22, 2012 at 12:51 PM, Paolo Bonzini pbonz...@redhat.com wrote: While QMP in general is designed so that it is possible to ignore unknown arguments, in the case of the QMP server it is better to reject them to detect bad clients. In fact, we're already doing this at the top level in the argument checker. To extend this to complex structures, add a mode to the input visitor where it checks for unvisited keys and raises an error if it finds one. Signed-off-by: Paolo Bonzini pbonz...@redhat.com --- qapi/qmp-input-visitor.c | 48 +- qapi/qmp-input-visitor.h | 2 + test-qmp-input-strict.c | 234 ++ tests/Makefile | 5 +- 4 files changed, 285 insertions(+), 4 deletions(-) create mode 100644 test-qmp-input-strict.c diff --git a/qapi/qmp-input-visitor.c b/qapi/qmp-input-visitor.c index 97a0186..eb09874 100644 --- a/qapi/qmp-input-visitor.c +++ b/qapi/qmp-input-visitor.c @@ -24,6 +24,7 @@ typedef struct StackObject { QObject *obj; const QListEntry *entry; + GHashTable *h; } StackObject; struct QmpInputVisitor @@ -31,6 +32,7 @@ struct QmpInputVisitor Visitor visitor; StackObject stack[QIV_STACK_SIZE]; int nb_stack; + bool strict; }; static QmpInputVisitor *to_qiv(Visitor *v) @@ -45,6 +47,9 @@ static QObject *qmp_input_get_object(QmpInputVisitor *qiv, if (qobj) { if (name qobject_type(qobj) == QTYPE_QDICT) { + if (qiv-stack[qiv-nb_stack - 1].h) { + g_hash_table_remove(qiv-stack[qiv-nb_stack - 1].h, name); + } return qdict_get(qobject_to_qdict(qobj), name); } else if (qiv-stack[qiv-nb_stack - 1].entry) { return qlist_entry_obj(qiv-stack[qiv-nb_stack - 1].entry); @@ -54,20 +59,47 @@ static QObject *qmp_input_get_object(QmpInputVisitor *qiv, return qobj; } +static void qdict_add_key(const char *key, QObject *obj, void *opaque) +{ + GHashTable *h = opaque; + g_hash_table_insert(h, (gpointer) key, NULL); +} + static void qmp_input_push(QmpInputVisitor *qiv, QObject *obj, Error **errp) { - qiv-stack[qiv-nb_stack].obj = obj; - qiv-stack[qiv-nb_stack].entry = NULL; - qiv-nb_stack++; + GHashTable *h; if (qiv-nb_stack = QIV_STACK_SIZE) { error_set(errp, QERR_BUFFER_OVERRUN); return; } + + qiv-stack[qiv-nb_stack].obj = obj; + qiv-stack[qiv-nb_stack].entry = NULL; + qiv-stack[qiv-nb_stack].h = NULL; + + if (qiv-strict qobject_type(obj) == QTYPE_QDICT) { + h = g_hash_table_new(g_str_hash, g_str_equal); + qdict_iter(qobject_to_qdict(obj), qdict_add_key, h); + qiv-stack[qiv-nb_stack].h = h; + } + + qiv-nb_stack++; } static void qmp_input_pop(QmpInputVisitor *qiv, Error **errp) { + GHashTableIter iter; GHashTableIter is alas not available in the glib (2.12) that the distros we use at work run. Is there a workaround for this issue? Thanks, Laurent + gpointer key; + + if (qiv-strict qiv-stack[qiv-nb_stack - 1].h) { + g_hash_table_iter_init(iter, qiv-stack[qiv-nb_stack - 1].h); + if (g_hash_table_iter_next(iter, key, NULL)) { + error_set(errp, QERR_QMP_EXTRA_MEMBER, (char *) key); + } + g_hash_table_unref(qiv-stack[qiv-nb_stack - 1].h); + } + assert(qiv-nb_stack 0); qiv-nb_stack--; } @@ -257,3 +289,13 @@ QmpInputVisitor *qmp_input_visitor_new(QObject *obj) return v; } + +QmpInputVisitor *qmp_input_visitor_new_strict(QObject *obj) +{ + QmpInputVisitor *v; + + v = qmp_input_visitor_new(obj); + v-strict = true; + + return v; +} diff --git a/qapi/qmp-input-visitor.h b/qapi/qmp-input-visitor.h index 3f798f0..e0a48a5 100644 --- a/qapi/qmp-input-visitor.h +++ b/qapi/qmp-input-visitor.h @@ -20,6 +20,8 @@ typedef struct QmpInputVisitor QmpInputVisitor; QmpInputVisitor *qmp_input_visitor_new(QObject *obj); +QmpInputVisitor *qmp_input_visitor_new_strict(QObject *obj); + void qmp_input_visitor_cleanup(QmpInputVisitor *v); Visitor *qmp_input_get_visitor(QmpInputVisitor *v); diff --git a/test-qmp-input-strict.c b/test-qmp-input-strict.c new file mode 100644 index 000..f6df8cb --- /dev/null +++ b/test-qmp-input-strict.c @@ -0,0 +1,234 @@ +/* + * QMP Input Visitor unit-tests (strict mode). + * + * Copyright (C) 2011-2012 Red Hat Inc. + * + * Authors: + * Luiz Capitulino lcapitul...@redhat.com + * Paolo Bonzini pbonz...@redhat.com + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include glib.h +#include stdarg.h + +#include qapi/qmp-input-visitor.h +#include test-qapi-types.h +#include test-qapi-visit.h +#include qemu-objects.h + +typedef struct TestInputVisitorData { + QObject *obj; + QmpInputVisitor *qiv; +}
Re: [Qemu-devel] [PATCH v2 0/6] ARM: AREG0 conversion
On Tue, Mar 27, 2012 at 9:59 PM, Artyom Tarasenko atar4q...@gmail.com wrote: On Tue, Mar 27, 2012 at 7:01 PM, Laurent Desnogues laurent.desnog...@gmail.com wrote: On Tue, Mar 27, 2012 at 6:48 PM, Blue Swirl blauwir...@gmail.com wrote: On Tue, Mar 27, 2012 at 13:40, Laurent Desnogues laurent.desnog...@gmail.com wrote: On Mon, Mar 26, 2012 at 7:02 PM, Blue Swirl blauwir...@gmail.com wrote: [...] At least stack protector is protecting more code than before (for example TLB miss handler), but could overhead from that amount to 5%? Otherwise there should be just a few extra register moves here and there, that should be cheap on modern processors. The extra moves might be cheap but their cost is obviously not 0: on top of using extra CPU core resources, code size is increased which results in more instruction cache misses. I didn't like the idea when we discussed it back in May, now it looks like we have concrete evidence the speed impact is measurable (though I'd like some more numbers than the rough 5% estimate I gave). A clearly defined test case running on a host that does not adjust clock frequencies would be nice. It would be interesting to find out where exactly the slowdown comes from. Perhaps the access helpers ({helper,_}_{ld,st}{b,w,l}_mmu) generated by softmmu_template.h are the culprit. If so, they could be split from other code and moved to TCG back ends. That way the interface could be improved while keeping all other cleanups. I also get a slowdown running in user mode, so I don't think improving the mmu ld/st will completely remove the issue. In that case the slowdown comes from the extra move instructions for helper calls. The ARM target uses way too many helpers, but that's another discussion :-) Have you tried compiling without -fstack-protector-all as suggested by Lluís? I observe a similar slowdown on a sparc target, and there compiling without stack protection definitely helps. That will indeed probably make the real problem, which is that this patch increases the size of generated code, less obvious on small benchmarks that don't put pressure on instruction cache. But the fact is that generated code is larger and will have to execute more instructions, so no matter what you do, this will have an impact on speed. Laurent
Re: [Qemu-devel] [PATCH v2 0/6] ARM: AREG0 conversion
On Mon, Mar 26, 2012 at 7:02 PM, Blue Swirl blauwir...@gmail.com wrote: [...] At least stack protector is protecting more code than before (for example TLB miss handler), but could overhead from that amount to 5%? Otherwise there should be just a few extra register moves here and there, that should be cheap on modern processors. The extra moves might be cheap but their cost is obviously not 0: on top of using extra CPU core resources, code size is increased which results in more instruction cache misses. I didn't like the idea when we discussed it back in May, now it looks like we have concrete evidence the speed impact is measurable (though I'd like some more numbers than the rough 5% estimate I gave). Laurent
Re: [Qemu-devel] [PATCH v2 0/6] ARM: AREG0 conversion
On Tue, Mar 27, 2012 at 6:48 PM, Blue Swirl blauwir...@gmail.com wrote: On Tue, Mar 27, 2012 at 13:40, Laurent Desnogues laurent.desnog...@gmail.com wrote: On Mon, Mar 26, 2012 at 7:02 PM, Blue Swirl blauwir...@gmail.com wrote: [...] At least stack protector is protecting more code than before (for example TLB miss handler), but could overhead from that amount to 5%? Otherwise there should be just a few extra register moves here and there, that should be cheap on modern processors. The extra moves might be cheap but their cost is obviously not 0: on top of using extra CPU core resources, code size is increased which results in more instruction cache misses. I didn't like the idea when we discussed it back in May, now it looks like we have concrete evidence the speed impact is measurable (though I'd like some more numbers than the rough 5% estimate I gave). A clearly defined test case running on a host that does not adjust clock frequencies would be nice. It would be interesting to find out where exactly the slowdown comes from. Perhaps the access helpers ({helper,_}_{ld,st}{b,w,l}_mmu) generated by softmmu_template.h are the culprit. If so, they could be split from other code and moved to TCG back ends. That way the interface could be improved while keeping all other cleanups. I also get a slowdown running in user mode, so I don't think improving the mmu ld/st will completely remove the issue. In that case the slowdown comes from the extra move instructions for helper calls. The ARM target uses way too many helpers, but that's another discussion :-) Laurent
Re: [Qemu-devel] [PATCH 1/6] arm: move neon_tbl to neon_helper.c
On Mon, Mar 19, 2012 at 11:10 PM, Peter Maydell peter.mayd...@linaro.org wrote: On 19 March 2012 21:56, Blue Swirl blauwir...@gmail.com wrote: -DEF_HELPER_4(neon_tbl, i32, i32, i32, i32, i32) +DEF_HELPER_5(neon_tbl, i32, env, i32, i32, i32, i32) --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -6340,7 +6340,7 @@ static int disas_neon_data_insn(CPUARMState * env, DisasContext *s, uint32_t ins tmp2 = neon_load_reg(rm, 0); tmp4 = tcg_const_i32(rn); tmp5 = tcg_const_i32(n); - gen_helper_neon_tbl(tmp2, tmp2, tmp, tmp4, tmp5); + gen_helper_neon_tbl(cpu_env, tmp2, tmp2, tmp, tmp4, tmp5); tcg_temp_free_i32(tmp); if (insn (1 6)) { tmp = neon_load_reg(rd, 1); @@ -6349,7 +6349,7 @@ static int disas_neon_data_insn(CPUARMState * env, DisasContext *s, uint32_t ins tcg_gen_movi_i32(tmp, 0); } tmp3 = neon_load_reg(rm, 1); - gen_helper_neon_tbl(tmp3, tmp3, tmp, tmp4, tmp5); + gen_helper_neon_tbl(cpu_env, tmp3, tmp3, tmp, tmp4, tmp5); tcg_temp_free_i32(tmp5); tcg_temp_free_i32(tmp4); neon_store_reg(rd, 0, tmp2); ...shouldn't these be gen_helper_neon_tbl(tmp3, cpu_env, tmp3, tmp, tmp4, tmp5); Indeed. Compiling with --enable-debug doesn't work. Laurent
Re: [Qemu-devel] [PATCH 0/6] ARM: AREG0 conversion
On Mon, Mar 19, 2012 at 10:55 PM, Blue Swirl blauwir...@gmail.com wrote: Convert ARM to AREG0 free operation. Survives simple tests. After fixing the issue about tbl helper usage, I could run some simple linux-user tests and boot a rather large Linux image. It looks like the kernel boot is about 5% slower. Laurent URL git://repo.or.cz/qemu/blueswirl.git http://repo.or.cz/r/qemu/blueswirl.git Blue Swirl (6): arm: move neon_tbl to neon_helper.c arm: move saturating arithmetic to helper.c arm: move other arithmetic to helper.c arm: move cpsr and banked register access to helper.c arm: move exception and wfi helpers to helper.c arm: move load and store helpers, switch to AREG0 free mode Makefile.target | 4 +- configure | 2 +- target-arm/helper.c | 388 +- target-arm/helper.h | 60 target-arm/neon_helper.c | 22 +++ target-arm/op_helper.c | 430 -- target-arm/translate.c | 148 7 files changed, 513 insertions(+), 541 deletions(-) delete mode 100644 target-arm/op_helper.c -- 1.7.9
Re: [Qemu-devel] [PATCH] target-arm: Decode SETEND correctly in Thumb
On Tue, Mar 13, 2012 at 3:45 PM, Peter Maydell peter.mayd...@linaro.org wrote: Decode the SETEND instruction correctly in Thumb mode, rather than accidentally treating it like CPS. We don't support BE8 mode, but this change brings the Thumb mode in to line with behaviour in ARM mode: 'SETEND BE' is not supported and will provoke an UNDEF exception, but 'SETEND LE' is correctly handled as a no-op. Signed-off-by: Peter Maydell peter.mayd...@linaro.org Reported-by: Daniel Forsgren daniel.forsg...@enea.com The patch looks good to me; I guess this would qualify for a: Reviewed-by: Laurent Desnogues laurent.desnog...@gmail.com The only point I dislike isn't directly related to this patch: the use of illegal_op that behaves exactly as UNDEF looks odd. Laurent --- This is one of those patches where I wasn't sure whether to try to split it into a whitespace-only part and a significant-change part. Most of this is (a) indenting existing code another notch for the extra switch statement and (b) adding braces to placate checkpatch. target-arm/translate.c | 63 ++- 1 files changed, 40 insertions(+), 23 deletions(-) diff --git a/target-arm/translate.c b/target-arm/translate.c index 280bfca..3196619 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -9704,32 +9704,49 @@ static void disas_thumb_insn(CPUState *env, DisasContext *s) store_reg(s, rd, tmp); break; - case 6: /* cps */ - ARCH(6); - if (IS_USER(s)) + case 6: + switch ((insn 5) 7) { + case 2: + /* setend */ + ARCH(6); + if (insn (1 3)) { + /* BE8 mode not implemented. */ + goto illegal_op; + } break; - if (IS_M(env)) { - tmp = tcg_const_i32((insn (1 4)) != 0); - /* FAULTMASK */ - if (insn 1) { - addr = tcg_const_i32(19); - gen_helper_v7m_msr(cpu_env, addr, tmp); - tcg_temp_free_i32(addr); + case 3: + /* cps */ + ARCH(6); + if (IS_USER(s)) { + break; } - /* PRIMASK */ - if (insn 2) { - addr = tcg_const_i32(16); - gen_helper_v7m_msr(cpu_env, addr, tmp); - tcg_temp_free_i32(addr); + if (IS_M(env)) { + tmp = tcg_const_i32((insn (1 4)) != 0); + /* FAULTMASK */ + if (insn 1) { + addr = tcg_const_i32(19); + gen_helper_v7m_msr(cpu_env, addr, tmp); + tcg_temp_free_i32(addr); + } + /* PRIMASK */ + if (insn 2) { + addr = tcg_const_i32(16); + gen_helper_v7m_msr(cpu_env, addr, tmp); + tcg_temp_free_i32(addr); + } + tcg_temp_free_i32(tmp); + gen_lookup_tb(s); + } else { + if (insn (1 4)) { + shift = CPSR_A | CPSR_I | CPSR_F; + } else { + shift = 0; + } + gen_set_psr_im(s, ((insn 7) 6), 0, shift); } - tcg_temp_free_i32(tmp); - gen_lookup_tb(s); - } else { - if (insn (1 4)) - shift = CPSR_A | CPSR_I | CPSR_F; - else - shift = 0; - gen_set_psr_im(s, ((insn 7) 6), 0, shift); + break; + default: + goto undef; } break; -- 1.7.1
Re: [Qemu-devel] [MASCOT CONTEST] Alex Bradbury #1
On Wed, Feb 15, 2012 at 3:31 PM, Anthony Liguori anth...@codemonkey.ws wrote: Please respond to this note with an '+1', or an Ack, to vote for this icon. Ack Laurent
Re: [Qemu-devel] profiling qemu
2012/2/14 Lluís Vilanova vilan...@ac.upc.edu: Artyom Tarasenko writes: [...] Here it looks like compute_all_sub and compute_all_sub_xcc are good candidates for optimizing: together they take the same amount of time as cpu_sparc_exec. I guess both operations would be trivial in the x86_64 assembler. What would be the best strategy to make TCG take the advantage of running on a x86_64 host? A quick look into the code reveals that these two are called from a TCG helper (helper_compute_psr), so I see two approaches here applicable to the most frequently used sub-operations in helper_compute_psr: * Define new simpler helpers for those sub-operations that can be declared with TCG_CALL_CONST and generate the new psr/xcc values in temporal registers. You must make sure any other code will still be able to use the new psr/xcc values. * Reimplement these sub-operations in pure TCG code. But first, make sure you run a proper benchmark to establish where are the hotspots in the sparc code for QEMU. The problem here is to establish what a proper benchmark is :) Similar helpers are used in ARM translation, so I'm not surprised they show up (typically sub/flag instructions are used for loops). A good strategy is indeed to generate TCG code and let the NZ/C/etc. be global temps as other CPU registers. This gains a few percents of speed. HTH, Laurent
Re: [Qemu-devel] profiling qemu
On Tue, Feb 14, 2012 at 4:15 PM, Artyom Tarasenko atar4q...@gmail.com wrote: 2012/2/14 Laurent Desnogues laurent.desnog...@gmail.com: 2012/2/14 Lluís Vilanova vilan...@ac.upc.edu: Artyom Tarasenko writes: [...] Here it looks like compute_all_sub and compute_all_sub_xcc are good candidates for optimizing: together they take the same amount of time as cpu_sparc_exec. I guess both operations would be trivial in the x86_64 assembler. What would be the best strategy to make TCG take the advantage of running on a x86_64 host? A quick look into the code reveals that these two are called from a TCG helper (helper_compute_psr), so I see two approaches here applicable to the most frequently used sub-operations in helper_compute_psr: * Define new simpler helpers for those sub-operations that can be declared with TCG_CALL_CONST and generate the new psr/xcc values in temporal registers. You must make sure any other code will still be able to use the new psr/xcc values. * Reimplement these sub-operations in pure TCG code. But first, make sure you run a proper benchmark to establish where are the hotspots in the sparc code for QEMU. The problem here is to establish what a proper benchmark is :) Similar helpers are used in ARM translation, so I'm not surprised they show up (typically sub/flag instructions are used for loops). A good strategy is indeed to generate TCG code and let the NZ/C/etc. be global temps as other CPU registers. This gains a few percents of speed. Can you give an example, where global temp would be faster than an inline helper? At the first sight it's trading a cheap math operation (in case of sub, a few cheap math operations in case of subx) against a memory access. Or do you mean, use the global flag registers instead of CC_SRC{1,2} and always compute them? I mean that for my work on ARM I just declared the NZ/C/V fields as other registers (tcg_global_mem_new) and let the TCG optimizer do the work. I can't say if that would play well with the SPARC front-end that mimics the way x86 does flag handling. (I would argue that on x86 you have no choice because most instructions do touch flags, while it's not true on SPARC, IIRC.) I am afraid you'll have to try yourself and see if you gain something :-) Laurent
Re: [Qemu-devel] [PATCH 01/19] linux-user: stack_base is now mandatory on all targets
On Fri, Feb 3, 2012 at 3:49 PM, riku.voi...@linaro.org wrote: From: Riku Voipio riku.voi...@linaro.org Signed-off-by: Riku Voipio riku.voi...@linaro.org --- linux-user/qemu.h | 2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/linux-user/qemu.h b/linux-user/qemu.h index 55ad9d8..30e2abd 100644 --- a/linux-user/qemu.h +++ b/linux-user/qemu.h @@ -123,10 +123,10 @@ typedef struct TaskState { #endif #if defined(TARGET_ARM) || defined(TARGET_M68K) || defined(TARGET_UNICORE32) /* Extra fields for semihosted binaries. */ - uint32_t stack_base; uint32_t heap_base; uint32_t heap_limit; #endif + uint32_t stack_base; Shouldn't this be abi_ulong instead of uint32_t? int used; /* non zero if used */ struct image_info *info; struct linux_binprm *bprm; Laurent
Re: [Qemu-devel] icount and tb chaining
On Tue, Jan 17, 2012 at 7:50 PM, Peter Maydell peter.mayd...@linaro.org wrote: 2012/1/13 James Greensky gsk...@gmail.com: Sure, usually a tb chain is setup after a subsequent tb is found/constructed in the loop in cpu_exec when a tb returns. Taken/non-taken branch chaining is implemented by indicating the branch direction by the two least significant digits of the the previously returned tb. This is usually 0/1. When running icount, you can also get a 2 value in these least significant digits, indicating that the translation block was restarted due to the icount_decr.u16.low field being exhausted but having instructions left to execute in icount_extra. This 2 value falls through to tb_add_jump, which then updates the tb's jmp_first field, as both tb and next_tb refer to the same translation block. My question is why is this necessary, why not do nothing, and leave the previous chaining intact? This looks suspiciously like a bug to me, although the code is a bit opaque to me. Calling tb_add_jump() with n==2 seems like it's not going to work since tb_set_jmp_target() is going to fall off the end of either tb_jmp_offset[] or tb_next[], so even if we're playing clever games with jmp_first being treatable as jmp_next[2] for some purposes there's going to be a problem. I thought the 2 return from a TB execution was only used as a flag value, which would suggest that we should clear next_tb in the 'refill decrementer' code path too. I'm not sure I'm not missing something subtle, though. Hmm I wonder if I didn't miss something in the discussion because the only call to tb_add_jmp clears the lower two bits so I fail to see the issue. Laurent
Re: [Qemu-devel] icount and tb chaining
On Tue, Jan 17, 2012 at 8:06 PM, Laurent Desnogues laurent.desnog...@gmail.com wrote: On Tue, Jan 17, 2012 at 7:50 PM, Peter Maydell peter.mayd...@linaro.org wrote: 2012/1/13 James Greensky gsk...@gmail.com: Sure, usually a tb chain is setup after a subsequent tb is found/constructed in the loop in cpu_exec when a tb returns. Taken/non-taken branch chaining is implemented by indicating the branch direction by the two least significant digits of the the previously returned tb. This is usually 0/1. When running icount, you can also get a 2 value in these least significant digits, indicating that the translation block was restarted due to the icount_decr.u16.low field being exhausted but having instructions left to execute in icount_extra. This 2 value falls through to tb_add_jump, which then updates the tb's jmp_first field, as both tb and next_tb refer to the same translation block. My question is why is this necessary, why not do nothing, and leave the previous chaining intact? This looks suspiciously like a bug to me, although the code is a bit opaque to me. Calling tb_add_jump() with n==2 seems like it's not going to work since tb_set_jmp_target() is going to fall off the end of either tb_jmp_offset[] or tb_next[], so even if we're playing clever games with jmp_first being treatable as jmp_next[2] for some purposes there's going to be a problem. I thought the 2 return from a TB execution was only used as a flag value, which would suggest that we should clear next_tb in the 'refill decrementer' code path too. I'm not sure I'm not missing something subtle, though. Hmm I wonder if I didn't miss something in the discussion because the only call to tb_add_jmp clears the lower two bits so I fail to see the issue. Hmm I knew I was missing something, forget this :-) Laurent
Re: [Qemu-devel] [PATCH] target-i386: Fix regression with maxsd SSE2 instruction
On Tue, Nov 8, 2011 at 2:00 PM, Jason Wessel jason.wes...@windriver.com wrote: The maxsd instruction needs to take into account the sign of the numbers 64 bit numbers. This is a regression that was introduced in 347ac8e356 (target-i386: switch to softfloat). The case that fails is: maxsd %xmm1,%xmm0 When xmm1 = 24 and xmm0 = -100 This was found running the glib2 binding tests where it prints the message: /binding/transform: GLib-GObject-WARNING **: value 24.00 of type `gdouble' is invalid or out of range for property `value' of type `gdouble' aborting... Using a signed comparison fixes the problem. Signed-off-by: Jason Wessel jason.wes...@windriver.com --- target-i386/ops_sse.h | 4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/target-i386/ops_sse.h b/target-i386/ops_sse.h index aa41d25..bcc0ed9 100644 --- a/target-i386/ops_sse.h +++ b/target-i386/ops_sse.h @@ -584,8 +584,8 @@ void helper_ ## name ## sd (Reg *d, Reg *s)\ #define FPU_SUB(size, a, b) float ## size ## _sub(a, b, env-sse_status) #define FPU_MUL(size, a, b) float ## size ## _mul(a, b, env-sse_status) #define FPU_DIV(size, a, b) float ## size ## _div(a, b, env-sse_status) -#define FPU_MIN(size, a, b) (a) (b) ? (a) : (b) -#define FPU_MAX(size, a, b) (a) (b) ? (a) : (b) Isn't maxsd a floating-point instruction? If so, shouldn't FPU_{MIN,MAX} use softfloat operations? Laurent +#define FPU_MIN(size, a, b) (int ## size ## _t)(a) (int ## size ## _t)(b) ? (a) : (b) +#define FPU_MAX(size, a, b) (int ## size ## _t)(a) (int ## size ## _t)(b) ? (a) : (b) #define FPU_SQRT(size, a, b) float ## size ## _sqrt(b, env-sse_status) SSE_HELPER_S(add, FPU_ADD) -- 1.7.1
Re: [Qemu-devel] [PATCH] tcg: Remove stack protection from helper functions
On Mon, Sep 26, 2011 at 10:01 AM, Mulyadi Santosa mulyadi.sant...@gmail.com wrote: Hi... On Mon, Sep 26, 2011 at 14:46, Jan Kiszka jan.kis...@siemens.com wrote: This increases the overhead of frequently executed helpers. Signed-off-by: Jan Kiszka jan.kis...@siemens.com IMHO, stack protector setup put more stuffs during epilogue, but quite likely it is negligible unless it cause too much L1 cache misses. So, I think this micro tuning is somewhat unnecessary but still okay. The impact of stack protection is very high for instance running FFmpeg ARM with NEON optimizations: a few months ago I measured that removing stack protection improved the run time by more than 10%. Of course it's extreme since the proportion of NEON instructions (and hence of helper calls) is very high. Laurent
Re: [Qemu-devel] [HelenOS-devel] [sparc64] Miscomputed minimum of a group of numbers in sparc64 emulation
On Fri, Jul 1, 2011 at 2:57 PM, Jakub Jermar ja...@jermar.eu wrote: [...] When _not_ singlestepping via GDB's `stepi`, the testcase will fail and crash Qemu like this: qemu: fatal: Trap 0x0101 while trap level (5) = MAXTL (5), Error state register dumped here On the other hand, when I attach GDB to Qemu and step through all instructions using `stepi`, the testcase will succeed and crash Qemu like this: qemu: fatal: Trap 0x0100 while trap level (5) = MAXTL (5), Error state registers dumped here Mind the difference in the trap type - 0x100 for success, 0x101 for failure. This is how I run the test: Without GDB: $ qemu-system-sparc64 -bios ./testcase With GDB: $ qemu-system-sparc64 -bios ./testcase -s -S From another terminal: $ /usr/local/cross/sparc64/bin/sparc64-linux-gnu-gdb (gdb) set architecture sparc:v9 (gdb) target remote localhost:1234 (gdb) stepi ... Hope this helps to fix the problem. You don't have to use gdb to reproduce the issue, just add -singlestep when running qemu. I find it odd that udivx is using cpu_cc_src and cpu_cc_src2. Using dedicated local temps seems to fix the issue. HTH, Laurent
Re: [Qemu-devel] [HelenOS-devel] [sparc64] Miscomputed minimum of a group of numbers in sparc64 emulation
On Fri, Jul 1, 2011 at 4:11 PM, Jakub Jermar ja...@jermar.eu wrote: [...] Actually, the testcase can be further reduced into: .global _start .text .space 0x20 _start: set 110393, %i1 set 0x40, %i2 cmp %i1, %i2 udivx %g0, 1, %g0 movgu %xcc, %i2, %i1 cmp %i1, 512 bgu %xcc, 0f nop succ: ta 0 fail: 0: ta 1 The presence of the `udivx` instruction seems to be essential. Even though it has no effect on the computation, removing it will make the testcase non-reproducible. Could you try to replace udivx with sdivx? It looks wrong too. Laurent
Re: [Qemu-devel] [HelenOS-devel] [sparc64] Miscomputed minimum of a group of numbers in sparc64 emulation
On Fri, Jul 1, 2011 at 5:03 PM, Artyom Tarasenko atar4q...@gmail.com wrote: [...] I find it odd that udivx is using cpu_cc_src and cpu_cc_src2. Using dedicated local temps seems to fix the issue. Do we need to copy cpu_src* to further temps at all? IMHO - tcg_gen_mov_tl(cpu_cc_src, cpu_src1); - tcg_gen_mov_tl(cpu_cc_src2, cpu_src2); - gen_trap_ifdivzero_tl(cpu_cc_src2); - tcg_gen_divu_i64(cpu_dst, cpu_cc_src, cpu_cc_src2); + gen_trap_ifdivzero_tl(cpu_src2); + tcg_gen_divu_i64(cpu_dst, cpu_src1, cpu_src2); should do it. Or cpu_src is what you mean by dedicated? You have to use two local temps here to store cpu_src1 and cpu_src2 because gen_trap_ifdivzero_tl uses tcg_gen_brcondi_tl (cf tcg/README comment about local usage and jumps). Note you'll have to do something similar in gen_op_sdivx. Laurent
Re: [Qemu-devel] [PATCH][sparc64] fix cpu_cc_src and cpu_cc_src2 corruption in udivx and sdivx
On Fri, Jul 1, 2011 at 9:28 PM, Artyom Tarasenko atar4q...@gmail.com wrote: udivx and sdvix don't modify condition flags, so they shall not overwrite cpu_cc_* Looks good to me. Laurent Signed-off-by: Artyom Tarasenko atar4q...@gmail.com --- target-sparc/translate.c | 32 ++-- 1 files changed, 22 insertions(+), 10 deletions(-) diff --git a/target-sparc/translate.c b/target-sparc/translate.c index 992cd77..f32a674 100644 --- a/target-sparc/translate.c +++ b/target-sparc/translate.c @@ -727,19 +727,24 @@ static inline void gen_trap_ifdivzero_tl(TCGv divisor) static inline void gen_op_sdivx(TCGv dst, TCGv src1, TCGv src2) { int l1, l2; + TCGv r_temp1, r_temp2; l1 = gen_new_label(); l2 = gen_new_label(); - tcg_gen_mov_tl(cpu_cc_src, src1); - tcg_gen_mov_tl(cpu_cc_src2, src2); - gen_trap_ifdivzero_tl(cpu_cc_src2); - tcg_gen_brcondi_tl(TCG_COND_NE, cpu_cc_src, INT64_MIN, l1); - tcg_gen_brcondi_tl(TCG_COND_NE, cpu_cc_src2, -1, l1); + r_temp1 = tcg_temp_local_new(); + r_temp2 = tcg_temp_local_new(); + tcg_gen_mov_tl(r_temp1, src1); + tcg_gen_mov_tl(r_temp2, src2); + gen_trap_ifdivzero_tl(r_temp2); + tcg_gen_brcondi_tl(TCG_COND_NE, r_temp1, INT64_MIN, l1); + tcg_gen_brcondi_tl(TCG_COND_NE, r_temp2, -1, l1); tcg_gen_movi_i64(dst, INT64_MIN); tcg_gen_br(l2); gen_set_label(l1); - tcg_gen_div_i64(dst, cpu_cc_src, cpu_cc_src2); + tcg_gen_div_i64(dst, r_temp1, r_temp2); gen_set_label(l2); + tcg_temp_free(r_temp1); + tcg_temp_free(r_temp2); } #endif @@ -3173,10 +3178,17 @@ static void disas_sparc_insn(DisasContext * dc) break; #ifdef TARGET_SPARC64 case 0xd: /* V9 udivx */ - tcg_gen_mov_tl(cpu_cc_src, cpu_src1); - tcg_gen_mov_tl(cpu_cc_src2, cpu_src2); - gen_trap_ifdivzero_tl(cpu_cc_src2); - tcg_gen_divu_i64(cpu_dst, cpu_cc_src, cpu_cc_src2); + { + TCGv r_temp1, r_temp2; + r_temp1 = tcg_temp_local_new(); + r_temp2 = tcg_temp_local_new(); + tcg_gen_mov_tl(r_temp1, cpu_src1); + tcg_gen_mov_tl(r_temp2, cpu_src2); + gen_trap_ifdivzero_tl(r_temp2); + tcg_gen_divu_i64(cpu_dst, r_temp1, r_temp2); + tcg_temp_free(r_temp1); + tcg_temp_free(r_temp2); + } break; #endif case 0xe: /* udiv */ -- 1.7.3.4
Re: [Qemu-devel] Actual TB code doesn't look like what was intended (TCG issue)?
On Fri, Jun 24, 2011 at 4:44 AM, Max Filippov jcmvb...@gmail.com wrote: Hello guys. I'm running qemu on x86_64 host. It's clean build from git sources dated 2011.05.19, commit 1fddfba129f5435c80eda14e8bc23fdb888c7187 I have the following output from log trace,op,out_asm: Trace 0x4000a310 [d0026c92] OP: 0xd0c0 movi_i32 tmp1,$0xfff4 add_i32 tmp0,ar9,tmp1 qemu_ld32 ar1,tmp0,$0x0 0xd0c3 movi_i32 tmp1,$0xfff0 add_i32 tmp0,ar9,tmp1 qemu_ld32 ar0,tmp0,$0x0 [...snip...] [...] 0x4000a360: xor%esi,%esi 0x4000a362: callq 0x52edc2 [...] (gdb) x/25i 0x4000a330 [...] 0x4000a360: mov $0x1,%esi 0x4000a365: callq 0x52edc2 __ldl_mmu 0x4000a36a: mov %eax,%ebp 0x4000a36c: sub $0x44,%al = 0x4000a36e: lea -0x10(%rbx),%esp 0x4000a371: mov %ebp,0xc(%r14) 0x4000a375: mov %r12d,%esi 0x4000a378: mov %r12d,%edi Please note how the current instruction in gdb differ from what was said in OUT. This lea corrupts stack pointer and the next callq generates segfault. Could please anyone familiar with TCG take a look at this, or suggest where I should look myself? As Peter hinted, you're not looking at the code you think :-) Note how your original TCG code does loads: qemu_ld32 ar1,tmp0,$0x0 That $0x0 will end up in %RSI. It's the mem index used to distinguish from user and privileged level accesses. In your examples of host code, in one case it is 0 and in the other it is 1, so you're definitely not really looking at the same block in the same running conditions. HTH, Laurent
Re: [Qemu-devel] Actual TB code doesn't look like what was intended (TCG issue)?
On Fri, Jun 24, 2011 at 10:35 AM, Max Filippov jcmvb...@gmail.com wrote: [...] Yes, I've noticed it (however, after I sent this mail). But (1) quoted OUT is the last OUT for this host address range in the log and (2) in gdb I set b tlb_fill if retaddr == 0x4000a369 and made some steps. You mean that I should look at previous OUTs for this address range? That should be the last block matching the address in the log output if you run *under gdb* with -d out_asm. BTW you say this is a clean build, but as far as I could see it looks like your emulated target is not part of standard QEMU; it seems to have a register named 'ar9'. Did I miss something or is it some non public target? ia64? Laurent
Re: [Qemu-devel] [RFC][PATCH v0 0/8] Improve register allocator
On Tue, May 24, 2011 at 1:31 PM, Kirill Batuzov batuz...@ispras.ru wrote: [...] Gathered statistics shows some interesting things too. I've run matrix multiplication benchmark (guest - ARM, host - x86, linux-user mode, with my patches applied) and here are the results: spill count 3916 real spills 32 spills at bb end 1023 spills at call: globals 2755 iarg passing 0 call cloobers 106 Real spills are spills generated by register allocator when it runs out of registers. They are less than 1% of all spills. Other tests show similar behavior. When you write host x86, do you mean IA32 or x86_64? That might change the number of real spills a lot if you meant x86_64. I think any further improvements to register allocator without leveling conventions about saving globals at calls and BB ends somehow is useless. Currently we are looking if we can pass some globals on registers through basic block boundaries (inside one TB of course). If by basic block, you mean BB as implied by TCG br for instance, I'm not sure all guests will benefit a lot. If you mean that you intend on putting several guests BB in a single TB then I guess you'll have to first collect dynamic statistics before dynamically switching to grouping BB. Laurent
Re: [Qemu-devel] [PATCH 9/9] cpu-exec.c: avoid AREG0 use
On Sun, May 22, 2011 at 7:10 PM, Peter Maydell peter.mayd...@linaro.org wrote: On 22 May 2011 17:55, Blue Swirl blauwir...@gmail.com wrote: For ARM, the handcrafted instructions below need to be changed to save also r7: /* stmdb sp!, { r4 - r6, r8 - r11, lr } */ tcg_out32(s, (COND_AL 28) | 0x092d4f70); /* ldmia sp!, { r4 - r6, r8 - r11, pc } */ tcg_out32(s, (COND_AL 28) | 0x08bd8f70); That would be ...ff0 rather than ...f70 in both cases (bottom 16 bits are a bit map of registers being saved/loaded): /* stmdb sp!, { r4 - r11, lr } */ tcg_out32(s, (COND_AL 28) | 0x092d4ff0); /* ldmia sp!, { r4 - r11, pc } */ tcg_out32(s, (COND_AL 28) | 0x08bd8ff0); Shouldn't you extend the range to include r12, due to the 8-byte alignment restriction of the stack? Laurent
Re: [Qemu-devel] [PATCH 0/6] Implement constant folding and copy propagation in TCG
On Sat, May 21, 2011 at 1:31 AM, Andreas Färber andreas.faer...@web.de wrote: [...] Has anyone evaluated reusing LLVM optimization passes for TCG? Or maybe GIMPL if there's an equivalent? IMHO the qemu_ld/st semantics and the size of TB blocks will always limit the usefulness of more involved optimizations than what already exists in QEMU. Basically qemu_ld/st ops prevent any code change across them which make optimization opportunities rather low. Considering they always take the fast path would surely change the situation, but then the slow path would have to be able to rebuild information; certainly possible, but quite involved too :-) Once this qemu_ld/st change is done, you'll the need to get to the problem of too short TB blocks; a way to do this is to generate traces of frequently executed blocks and consider them as a single block[1]; that would open the door to many more code generation improvements. This certainly doesn't mean Kirill and Aurélien changes are not welcome, just that I think, in the current state, such changes will always have a limited impact. Laurent [1] See for instance section 2.3 of Derek Bruening's PhD thesis about DynamoRIO http://www.burningcutlery.com/derek/phd.html
Re: [Qemu-devel] [PATCH RFC 00/11] AREG0 elimination
On Sun, May 15, 2011 at 9:15 AM, Blue Swirl blauwir...@gmail.com wrote: On Sun, May 15, 2011 at 1:04 AM, Aurelien Jarno aurel...@aurel32.net wrote: On Sun, May 15, 2011 at 12:52:35AM +0300, Blue Swirl wrote: On Sun, May 15, 2011 at 12:16 AM, Aurelien Jarno aurel...@aurel32.net wrote: On Sat, May 14, 2011 at 10:35:20PM +0300, Blue Swirl wrote: [...] The env register is used very often (basically for every load/store, but also a lot of helpers), so it makes sense to reserve a register for it. For what I understand from your patch series, you prefer to pass this register explicitly to TCG functions. This basically means this TCG global will be loaded to host register as soon as it is used, but also regularly, as globals are saved back to their canonical location before an helper or a load/store. So it seems that this patch series will just allowing the env register to change over time, though it will not spare one more register for the TCG code, and it will emit longer TCG code to regularly reload the env global into a host register. But there will be one more register available in some cases. In other Inside the TCG code, it will basically happens very rarely, given load/store are really the most used instructions, and they need to load the env register. Not exactly, from a sample run with -d op_opt: $ egrep -v -e '^$' -v -e 'OP after' -v -e ' end' -v -e 'Search PC' /tmp/qemu.log | awk '{print $1}' | sort | uniq -c|sort -rn 1673966 movi_i32 653931 ld_i32 607432 mov_i32 428684 st_i32 326878 movi_i64 308626 add_i32 283186 call 256817 exit_tb 207232 nopn 189388 goto_tb 122398 and_i32 117997 shr_i32 89107 qemu_ld32 82926 set_label 82713 brcond_i32 67169 qemu_st32 55109 or_i32 46536 ext32u_i64 44288 xor_i32 38103 sub_i32 26361 shl_i32 23218 shl_i64 23218 qemu_st64 23218 or_i64 20474 shr_i64 20445 qemu_ld64 11161 qemu_ld8u 10409 qemu_st8 5013 qemu_ld16u 3795 qemu_st16 2776 qemu_ld8s 1915 sar_i32 1414 qemu_ld16s 839 not_i32 579 setcond_i32 213 br 42 ext32s_i64 30 mul_i64 Unless I missed something, this doesn't show the usage of ld/st per TB, which is what Aurélien was looking for if I understood correctly. All I can say is that you had at most 256817 TB's and 234507 qemu_ld/st, so about one per TB. Anyway I must be thick, because I fail to see how generated code could access guest CPU registers without a pointer to the CPU env :-) IIUC the SPARC translator uses ld_i32/st_i32 mainly for accessing the guest CPU registers, which due to register windows is held in a dedicated global temp. Is that correct? If so this is kind of hiding accesses to the CPU env; all other targets read/write registers by using CPU env (through the use global temps in most cases). So I think most (if not almost all) TB will need a pointer to CPU env, which is why I think Aurélien's proposal to keep a dedicated register that'd be loaded in the prologue is the only way to not degrade performance of the generated code (I'd add that this dedicated register should be the one defined by the ABI as holding the first parameter value, if that's possible; I'm afraid this is not necessarily a good idea). Laurent
Re: [Qemu-devel] [PATCH RFC 00/11] AREG0 elimination
On Sun, May 15, 2011 at 1:33 PM, Blue Swirl blauwir...@gmail.com wrote: [...] x86_64 uses r14 as TCG_AREG0. Despite the instructions being quite simple (only 2 movi_i32), the resulting code makes 2 access to env to save the two registers. Having to reload the env pointer each time to a register would clearly increase the size of this TB. I don't think TCG would be that simple, instead the pointer would be loaded only once in this case. Assuming TCG was able to allocate a register for that, it would be live at most for one TB, so you'd have to load it at least once per TB, and with block chaining that wouldn't be efficient as you'd keep on reloading it. Laurent
Re: [Qemu-devel] tcg/tcg.c:1892: tcg fatal error
On Thu, Apr 21, 2011 at 4:57 PM, Artyom Tarasenko atar4q...@gmail.com wrote: On Tue, Apr 12, 2011 at 4:14 AM, Igor Kovalenko igor.v.kovale...@gmail.com wrote: Do you have public test case? It is possible to code this delay slot write test but real issue may be corruption elsewhere. The test case is trivial: it's just the two instructions, branch and wrpr. In theory there could be multiple issues including compiler induced ones. I'd prefer to see some kind of reproducible testcase. Ok, attached a 40 byte long test (the first 32 bytes are not used and needed only because the bios entry point is 0x20). $ git pull make sparc64-softmmu/qemu-system-sparc64 -bios test-wrpr.bin -nographic Already up-to-date. make[1]: Nothing to be done for `all'. /mnt/terra/projects/vanilla/qemu/tcg/tcg.c:1892: tcg fatal error Aborted The problem seems to be that wrpr is using a non-local TCG tmp (cpu_tmp0). Laurent
Re: [Qemu-devel] tcg/tcg.c:1892: tcg fatal error
On Thu, Apr 21, 2011 at 9:45 PM, Igor Kovalenko igor.v.kovale...@gmail.com wrote: On Thu, Apr 21, 2011 at 7:44 PM, Laurent Desnogues laurent.desnog...@gmail.com wrote: On Thu, Apr 21, 2011 at 4:57 PM, Artyom Tarasenko atar4q...@gmail.com wrote: On Tue, Apr 12, 2011 at 4:14 AM, Igor Kovalenko igor.v.kovale...@gmail.com wrote: Do you have public test case? It is possible to code this delay slot write test but real issue may be corruption elsewhere. The test case is trivial: it's just the two instructions, branch and wrpr. In theory there could be multiple issues including compiler induced ones. I'd prefer to see some kind of reproducible testcase. Ok, attached a 40 byte long test (the first 32 bytes are not used and needed only because the bios entry point is 0x20). $ git pull make sparc64-softmmu/qemu-system-sparc64 -bios test-wrpr.bin -nographic Already up-to-date. make[1]: Nothing to be done for `all'. /mnt/terra/projects/vanilla/qemu/tcg/tcg.c:1892: tcg fatal error Aborted The problem seems to be that wrpr is using a non-local TCG tmp (cpu_tmp0). Just tried the test case with write to %pil - seems like write itself is OK. The issue appears to be with save_state() call since adding save_state to %pil case provokes the same tcg abort. The problem is that cpu_tmp0, not being a local tmp, doesn't need to be saved across helper calls. This results in the TCG optimizer getting rid of it even though it's later used. Look at the log and you'll see what I mean :-) Laurent
[Qemu-devel] Re: QEMU regression problems - Update FPU
On Thu, Feb 24, 2011 at 12:10 PM, Paolo Bonzini pbonz...@redhat.com wrote: On 02/23/2011 08:04 PM, Aurelien Jarno wrote: Actually that's the reason why i386 doesn't use softfloat, as all the trigonometric use libm, and the bridge between softfloat and libm is not working correctly (plenty of type abuse). Besides, I doubt softfloat would want bug-compatible trig function implementations. fsincos for example is a far cry from the precision of the libm function. I agree, these functions certainly don't belong to SoftFloat. They should be implemented in target-i386/op_helper.c. But bit accuracy might be difficult to achieve, unless Intel/AMD properly documented how they compute all these functions. Laurent
Re: [Qemu-devel] Re: QEMU regression problems - Update FPU
On Wed, Feb 23, 2011 at 9:16 AM, Peter Maydell peter.mayd...@linaro.org wrote: On 18 February 2011 07:12, Gerhard Wiesinger li...@wiesinger.com wrote: Issue 1.) with FPU still present I tracked down the problematic code and it is a rounding error from double precision to 64bit floats: Any ideas how to fix such an issue in general? QEMU result in ST0: 0.42925860786976457 (wrong emulated) KVM result in ST0: 0.42925860786975449 (correct) This is an error when running QEMU in TCG mode, right? At the moment x86 is the odd-one-out in that it doesn't use CONFIG_SOFTFLOAT for its FPU emulation, so somebody has made an explicit choice of preferring speed over accuracy, and I am unsurprised that there are rounding errors as a result. Even if you were using SoftFloat, you'd probably still get wrong results given that this test seems to test trigonometric instructions which aren't implemented in SoftFloat, but are relying on libm. Laurent
Re: [Qemu-devel] QEMU license problem (was [PATCH v3] Drop --whole-archive and static libraries)
On Sun, Mar 7, 2010 at 2:53 PM, Anthony Liguori anth...@codemonkey.ws wrote: [...] Removing libqemu.a was technically ok, but throws a license problem: Every file contains a copyright/license. The statement in the top level is just a statement of intention. It's to make sure that people do not introduce GPL'd code into libqemu.a. ./configure --target-list=i386-linux-user,i386-softmmu make -j7 And: find . -name 'lib*a' finds nothing, so how can you say what is supposed to be in libqemu? Laurent
Re: [Qemu-devel] forking i386 binaries on arm linux user mode
On Wed, Feb 10, 2010 at 10:38 PM, Damion Yates dam...@google.com wrote: [...] Should clone()/fork() work? Has anyone been able to run wine ./blah.exe under user-linux mode of qemu on arm or indeed any other non x86 based CPU ? I forgot to mention NPTL is not supported for x86 which will be an issue. Laurent
Re: [Qemu-devel] forking i386 binaries on arm linux user mode
On Wed, Feb 10, 2010 at 10:38 PM, Damion Yates dam...@google.com wrote: I've grabbed the latest stable qemu and compiled under scratchbox. I hit an issue compiling it, with no __builtin__clear_cache() so linked in a kludge.c containing a call to __clear_cache() with the params passed as they would be to __builtin__clear_cache(). Firstly does this sound like it should work as a workaround? That's how I did it (except that I replaced the calls to __builtin__clear_cache with calls to __clear_cache in tcg/arm/tcg-target.h and exec-all.h. It certainly got me to the next level, which is that I can now run loads of linux binaries on my armlinux system (a Nokia n900). I've tried tower toppler (http://toppler.sourceforge.net/) which uses SDL (via X11) and this was surprisingly fast, in fact it almost felt as fast as the native toppler that somebody crosscompiled already. Most linux utils work when I copy then and any dependant libs from my x86 laptop to the phone. I'm lucky (I guess) that /lib/ld-linux.so.3 is the arm version and I'm using a slightly older .2 for x86 so I can have both files there. I also enabled arbitrary execution of binaries via binfmt_misc. The 600 Mhz Arm V8 Cortex (I think it is), feels like it's running at about Pentium 90 speeds, which I'm hoping is enough for what I really want to get going. The N900 has an ARMv7 Cortex-A8 (OMAP3). P90 is about the speed I'd expect. Note it will seem much slower if your program makes heavy use of floating point. I want to run an old, possibly win16 Windows game under wine. I saw that user mode qemu-i386 was able to run wine in a post in 2004: http://lists.terrasoftsolutions.com/pipermail/yellowdog-general/2004-June/014468.html - This was on a PPC however. When I run wine it SEGVs out and the strace of it shows it dies trying to do clone(). I also can't run things like xterm which can't do fork(). Is this because by default it's trying to go via the arm /bin/sh to invoke whatever it wants to exec() in to? Should clone()/fork() work? Has anyone been able to run wine ./blah.exe under user-linux mode of qemu on arm or indeed any other non x86 based CPU ? I have seen a complex Linux i386 program run on an ARM platform (with multithreading and Qt; note it crashed at some point but it was due to the application doing access to uninitialized memory). IIRC it was using chroot from inside QEMU (in main) to make sure that all calls to exec would happen on an x86 file system. You might give that a try. Laurent
Re: [Qemu-devel] [PATCH 4/4] target-arm: neon fix
On Mon, Feb 8, 2010 at 12:47 PM, Riku Voipio riku.voi...@iki.fi wrote: On Sun, Feb 07, 2010 at 02:02:31PM +0100, Laurent Desnogues wrote: On Sun, Feb 7, 2010 at 1:54 PM, Laurent Desnogues laurent.desnog...@gmail.com wrote: On Fri, Feb 5, 2010 at 4:52 PM, Riku Voipio riku.voi...@iki.fi wrote: From: Juha Riihimäki juha.riihim...@nokia.com add an extra check in two registers and a shift to ensure element size decoding logic cannot fail. I think there's a patch ordering problem that makes the comment and the change not agree :-) Sorry, apparently messed up while rebasing. BTW I don't think adding the check for size is needed here. The encoding at that point looks like this: 332211 10987654321098765432109876543210 001_1___1__1 001_1__1___1 001_1_11 so it will stop for size == 0 given that bit 19 will have to be set. Juha agrees so we'll drop this patch (or more precisely get the actual change out of the previous patch..) Do you intend to resend the patch 3 of this set or should it be reviewed as is? Thanks, Laurent
Re: [Qemu-devel] [PATCH] User mode: Handle x86_64 vsyscall
On Sun, Feb 7, 2010 at 1:22 AM, Jamie Lokier ja...@shareable.org wrote: [...] How would you achieve that? Your guest OS doesn't necessarily have the code mapped. I think this has to be considered as other syscalls, though slightly different. There is no guest OS when doing -user emulation. Only qemu. I meant that the vsyscall page doesn't exist on other guest systems but x86_64 running Linux. So if one wants to have it somehow mapped then it would have to be installed by QEMU, and QEMU can't install such a page due to limitations in the way it handles virtual addresses. My favorite solution would be a vsyscall page mapped to the correct fixed address and filled with QEMU generated specific code, for example code which calls the normal syscalls to do the work. This would only need modifications for linux-user code. You mean you'd explicitly put somewhere x86_64 code that simulates the behaviour of vsyscall? That seems like a good idea to me. Why not indeed. But someone will first have to fix virtual memory management. Laurent
Re: [Qemu-devel] [PATCH 1/4] target-arm: neon - fix VRADDHN/VRSUBHN vs VADDHN/VSUBHN
On Fri, Feb 5, 2010 at 4:52 PM, Riku Voipio riku.voi...@iki.fi wrote: From: Riku Voipio riku.voi...@nokia.com The rounding/truncating options were inverted. truncating was done when rounding was meant and vice verse. Signed-off-by: Riku Voipio riku.voi...@nokia.com Acked-by: Laurent Desnogues laurent.desnog...@gmail.com Laurent --- target-arm/translate.c | 4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/target-arm/translate.c b/target-arm/translate.c index 5cf3e06..4bd813a 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -4957,7 +4957,7 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) case 0: case 1: case 4: /* VADDL, VADDW, VADDHN, VRADDHN */ gen_neon_addl(size); break; - case 2: case 3: case 6: /* VSUBL, VSUBW, VSUBHL, VRSUBHL */ + case 2: case 3: case 6: /* VSUBL, VSUBW, VSUBHN, VRSUBHN */ gen_neon_subl(size); break; case 5: case 7: /* VABAL, VABDL */ @@ -5026,7 +5026,7 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) } else if (op == 4 || op == 6) { /* Narrowing operation. */ tmp = new_tmp(); - if (u) { + if (!u) { switch (size) { case 0: gen_helper_neon_narrow_high_u8(tmp, cpu_V0); -- 1.6.5
Re: [Qemu-devel] [PATCH 2/4] target-arm: neon vshll instruction fix
On Fri, Feb 5, 2010 at 4:52 PM, Riku Voipio riku.voi...@iki.fi wrote: From: Juha Riihimäki juha.riihim...@nokia.com implementation only widened the 32bit source vector elements into a 64bit destination vector but forgot to perform the actual shifting operation. Signed-off-by: Juha Riihimäki juha.riihim...@nokia.com Signed-off-by: Riku Voipio riku.voi...@nokia.com Acked-by: Laurent Desnogues laurent.desnog...@gmail.com Laurent --- target-arm/translate.c | 1 + 1 files changed, 1 insertions(+), 0 deletions(-) diff --git a/target-arm/translate.c b/target-arm/translate.c index 4bd813a..537d9d6 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -5385,6 +5385,7 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) if (pass == 1) tmp = tmp2; gen_neon_widen(cpu_V0, tmp, size, 1); + tcg_gen_shli_i64(cpu_V0, cpu_V0, 8 size); neon_store_reg64(cpu_V0, rd + pass); } break; -- 1.6.5
Re: [Qemu-devel] [PATCH 4/4] target-arm: neon fix
On Fri, Feb 5, 2010 at 4:52 PM, Riku Voipio riku.voi...@iki.fi wrote: From: Juha Riihimäki juha.riihim...@nokia.com add an extra check in two registers and a shift to ensure element size decoding logic cannot fail. Signed-off-by: Juha Riihimäki juha.riihim...@nokia.com Signed-off-by: Riku Voipio riku.voi...@nokia.com --- target-arm/translate.c | 3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/target-arm/translate.c b/target-arm/translate.c index 743b846..8bba034 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -4567,8 +4567,9 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) size = 3; } else { size = 2; - while (size (insn (1 (size + 19))) == 0) + while (size (insn (1 (size + 19))) == 0) { size--; + } } shift = (insn 16) ((1 (3 + size)) - 1); /* To avoid excessive dumplication of ops we implement shift I think there's a patch ordering problem that makes the comment and the change not agree :-) Laurent
Re: [Qemu-devel] [PATCH 4/4] target-arm: neon fix
On Sun, Feb 7, 2010 at 1:54 PM, Laurent Desnogues laurent.desnog...@gmail.com wrote: On Fri, Feb 5, 2010 at 4:52 PM, Riku Voipio riku.voi...@iki.fi wrote: From: Juha Riihimäki juha.riihim...@nokia.com add an extra check in two registers and a shift to ensure element size decoding logic cannot fail. Signed-off-by: Juha Riihimäki juha.riihim...@nokia.com Signed-off-by: Riku Voipio riku.voi...@nokia.com --- target-arm/translate.c | 3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/target-arm/translate.c b/target-arm/translate.c index 743b846..8bba034 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -4567,8 +4567,9 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) size = 3; } else { size = 2; - while (size (insn (1 (size + 19))) == 0) + while (size (insn (1 (size + 19))) == 0) { size--; + } } shift = (insn 16) ((1 (3 + size)) - 1); /* To avoid excessive dumplication of ops we implement shift I think there's a patch ordering problem that makes the comment and the change not agree :-) BTW I don't think adding the check for size is needed here. The encoding at that point looks like this: 332211 10987654321098765432109876543210 001_1___1__1 001_1__1___1 001_1_11 so it will stop for size == 0 given that bit 19 will have to be set. Laurent
Re: [Qemu-devel] [PATCH] User mode: Handle x86_64 vsyscall
On Sat, Feb 6, 2010 at 8:49 AM, Stefan Weil w...@mail.berlios.de wrote: [...] I tested two different hosts with x86_64-linux-user: * 32 bit Intel (i386) - does not work with your patch For me x86_64 on i386 has always failed without even calling vsyscall :-) * 64 bit AMD (x86_64) - works with your patch Your patch improves the emulation for 64 bit hosts. Nevertheless, it has some open points: * target-i386 code should not have to know about linux vsyscall Given that we have to workaround 64-bit virtual address limitations (cf. Richard mail and previous discussions on the list), doing otherwise looks difficult. * there is no vsyscall page in memory, but very special programs might expect to see one (it is even worse: the target sees the memory page of the host) * it is not possible to step into vsyscall code using a debugger How would you achieve that? Your guest OS doesn't necessarily have the code mapped. I think this has to be considered as other syscalls, though slightly different. My favorite solution would be a vsyscall page mapped to the correct fixed address and filled with QEMU generated specific code, for example code which calls the normal syscalls to do the work. This would only need modifications for linux-user code. You mean you'd explicitly put somewhere x86_64 code that simulates the behaviour of vsyscall? Laurent
Re: [Qemu-devel] [PATCH] User mode: Handle x86_64 vsyscall
On Fri, Feb 5, 2010 at 11:57 PM, Stefan Weil w...@mail.berlios.de wrote: Laurent Desnogues schrieb: [...] I'm still struggling with bntest and other x86_64-linux-user software calling any of the vsyscall functions. Laurent, your vsyscall patch only works on x86_64 hosts. A lot of software calls time() which uses vsyscall on x86_64 which does not work with x86_64-linux-user mode. I'm not sure I understand what you mean. Did you try on some other host and it failed? Was your host 32-bit? If so, I'm afraid user-mode will fail for more reasons than vsyscall. So the status of x86_64-linux-user is not more than experimental :-( I tried to modify x86_64-linux-user to set up a vsyscall page in high memory, but this seems to be difficult (at least with 32 bit host). Any hints how this should be done are welcome. My patch explicitly prevents the linking of the vsyscall page. Could you provide more info about your host? Laurent
Re: [Qemu-devel] Adding support for MIPS64 as host
On Thu, Feb 4, 2010 at 6:26 PM, Utkarsh Sopan utkarsh.so...@coe.dce.edu wrote: Can you tell me what is the status of adding MIPS64 support? as at the wiki page it showed Red earlier. Look here: http://git.aurel32.net/?p=qemu.git;a=summary Laurent
Re: [Qemu-devel] Need help to run application on QEMU
On Wed, Feb 3, 2010 at 11:10 AM, Taimoor Mirza mooni_mi...@hotmail.com wrote: Hi all, I have been trying to use VGA card and LCD for integrator 926. I've built my kernel PLUS with graphics support. I used following command to run my PLUS application on qemu: qemu-system-arm -M integratorcp -cpu arm926 -kernel ./graphics_demo.out I am getting following error: qemu: fatal: integratorcm_read: Unimplemented offset 0x54 R00=1000 R01= R02= R03=1000 R04=0014 R05=0400 R06=0038e054 R07= R08= R09=003b06ec R10= R11= R12=00390814 R13=003b06cc R14=00086148 R15=00085eec PSR=801f N--- A sys32 s00=( 0) s01=( 0) d00=( 0) s02=( 0) s03=( 0) d01=( 0) s04=( 0) s05=( 0) d02=( 0) s06=( 0) s07=( 0) d03=( 0) s08=( 0) s09=( 0) d04=( 0) s10=( 0) s11=( 0) d05=( 0) s12=( 0) s13=( 0) d06=( 0) s14=( 0) s15=( 0) d07=( 0) s16=( 0) s17=( 0) d08=( 0) s18=( 0) s19=( 0) d09=( 0) s20=( 0) s21=( 0) d10=( 0) s22=( 0) s23=( 0) d11=( 0) s24=( 0) s25=( 0) d12=( 0) s26=( 0) s27=( 0) d13=( 0) s28=( 0) s29=( 0) d14=( 0) s30=( 0) s31=( 0) d15=( 0) FPSCR: This application has requested the Runtime to terminate it in an unusual way. Please contact the application's support team for more information. Can anyone help me? Your software looks faulty the core module register at offset 0x54 is write only, cf. http://infocenter.arm.com/help/topic/com.arm.doc.dui0138e/DUI0138E_CMxx6_UserGuide.pdf Table 4-3 page 4-14. Laurent
Re: [Qemu-devel] Adding support for MIPS64 as host
On Wed, Feb 3, 2010 at 5:53 PM, Utkarsh Sopan utkarsh.so...@coe.dce.edu wrote: I am trying to add support for MIPS 64 as Host machine as my academic project. Note that some work has already been done for MIPS host support. I am new to QEMU. Problem is I dont have a MIPS 64 machine to test on. Please suggest whether or not I can use a nested QEMU emulator to test the same. i.e. running my version on top of a simulated MIPS64 machine. Is it possible to do so? I was able to run on an x86_64 host, an ARM-hosted QEMU simulating x86. That was for Linux user mode. I can't say if that would work for other targets or for system simulation. Laurent
Re: [Qemu-devel] [BUG] qemu-x86_64 crash when running bntest (was: [PATCH] Porting TCG to alpha platform)
On Sat, Jan 30, 2010 at 10:30 AM, Stefan Weil w...@mail.berlios.de wrote: Laurent Desnogues schrieb: On Sat, Jan 30, 2010 at 12:04 AM, Stefan Weil w...@mail.berlios.de wrote: [...] that was a good suggestion. bntest raises a segmentation fault (NULL pointer, p == 0, see below) with qemu-x86_64 on a x86_64 host. Compile bntest statically and it should work. x86_64 user mode is completely broken for dynamically linked programs. Laurent A statically linked bntest results in a crash, too. Stefan $ ldd bntest not a dynamic executable $ x86/x86_64-linux-user/qemu-x86_64 ./bntest ERROR: ioctl(SNDCTL_DSP_MAPINBUF): target=0x80085013 host=0x80105013 ERROR: ioctl(SNDCTL_DSP_MAPOUTBUF): target=0x80085014 host=0x80105014 obase=16 ibase=16 test BN_add print test BN_add\n qemu: uncaught target signal 11 (Segmentation fault) - core dumped That worked for me. Could show us the last translated TB? Laurent
Re: [Qemu-devel] [BUG] qemu-x86_64 crash when running bntest (was: [PATCH] Porting TCG to alpha platform)
On Sat, Jan 30, 2010 at 10:30 AM, Stefan Weil w...@mail.berlios.de wrote: Laurent Desnogues schrieb: On Sat, Jan 30, 2010 at 12:04 AM, Stefan Weil w...@mail.berlios.de wrote: [...] that was a good suggestion. bntest raises a segmentation fault (NULL pointer, p == 0, see below) with qemu-x86_64 on a x86_64 host. Compile bntest statically and it should work. x86_64 user mode is completely broken for dynamically linked programs. Laurent A statically linked bntest results in a crash, too. OK, I found the issue. It's vsyscall. You should use the patch I sent to this list a few months ago. Laurent
Re: [Qemu-devel] [PATCH] Porting TCG to alpha platform
On Sat, Jan 30, 2010 at 12:04 AM, Stefan Weil w...@mail.berlios.de wrote: [...] that was a good suggestion. bntest raises a segmentation fault (NULL pointer, p == 0, see below) with qemu-x86_64 on a x86_64 host. Compile bntest statically and it should work. x86_64 user mode is completely broken for dynamically linked programs. Laurent
Re: [Qemu-devel] Merge qemu android
On Thu, Jan 28, 2010 at 11:44 AM, Bastien ROUCARIES roucaries.bast...@gmail.com wrote: Forget to cc On Sat, Jan 23, 2010 at 11:40 PM, Anthony Liguori anth...@codemonkey.ws wrote: On 01/21/2010 10:27 AM, Bastien ROUCARIES wrote: Hi, What is the step in order to get qemu android merged mainline ? http://android.git.kernel.org/?p=platform/external/qemu.git;a=summary [...] I know but the code base is really different they use 0.8.2 as a base, and i was thinking of porting to upstream. Really? The tree you point to above has TCG. Or is it unused? They use also craps like sdl :S Mainline QEMU does too. Laurent
Re: [Qemu-devel] [PATCH 4/4] target-arm: refactor cp15.c13 register access
On Wed, Jan 27, 2010 at 1:49 PM, Riku Voipio riku.voi...@iki.fi wrote: From: Riku Voipio riku.voi...@nokia.com Access the cp15.c13 TLS registers directly with TCG ops instead of with a slow helper. If the the cp15 read/write was not TLS register access, fall back to the cp15 helper. This makes accessing __thread variables in linux-user when apps are compiled with -mtp=cp15 possible. legal cp15 register to acces from linux-user are already checked in cp15_user_ok. While at it, make the cp15.c13 Thread ID registers available only on ARMv6K and newer. Signed-off-by: Riku Voipio riku.voi...@nokia.com Acked-by: Laurent Desnogues laurent.desnog...@gmail.com Laurent --- target-arm/helper.c | 16 -- target-arm/translate.c | 55 2 files changed, 55 insertions(+), 16 deletions(-) diff --git a/target-arm/helper.c b/target-arm/helper.c index b3aec99..27001e8 100644 --- a/target-arm/helper.c +++ b/target-arm/helper.c @@ -511,7 +511,6 @@ void HELPER(set_cp15)(CPUState *env, uint32_t insn, uint32_t val) uint32_t HELPER(get_cp15)(CPUState *env, uint32_t insn) { cpu_abort(env, cp15 insn %08x\n, insn); - return 0; } /* These should probably raise undefined insn exceptions. */ @@ -1491,15 +1490,6 @@ void HELPER(set_cp15)(CPUState *env, uint32_t insn, uint32_t val) tlb_flush(env, 0); env-cp15.c13_context = val; break; - case 2: - env-cp15.c13_tls1 = val; - break; - case 3: - env-cp15.c13_tls2 = val; - break; - case 4: - env-cp15.c13_tls3 = val; - break; default: goto bad_reg; } @@ -1779,12 +1769,6 @@ uint32_t HELPER(get_cp15)(CPUState *env, uint32_t insn) return env-cp15.c13_fcse; case 1: return env-cp15.c13_context; - case 2: - return env-cp15.c13_tls1; - case 3: - return env-cp15.c13_tls2; - case 4: - return env-cp15.c13_tls3; default: goto bad_reg; } diff --git a/target-arm/translate.c b/target-arm/translate.c index 5cf3e06..786c329 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -2455,6 +2455,57 @@ static int cp15_user_ok(uint32_t insn) return 0; } +static int cp15_tls_load_store(CPUState *env, DisasContext *s, uint32_t insn, uint32_t rd) +{ + TCGv tmp; + int cpn = (insn 16) 0xf; + int cpm = insn 0xf; + int op = ((insn 5) 7) | ((insn 18) 0x38); + + if (!arm_feature(env, ARM_FEATURE_V6K)) + return 0; + + if (!(cpn == 13 cpm == 0)) + return 0; + + if (insn ARM_CP_RW_BIT) { + tmp = new_tmp(); + switch (op) { + case 2: + tcg_gen_ld_i32(tmp, cpu_env, offsetof(CPUARMState, cp15.c13_tls1)); + break; + case 3: + tcg_gen_ld_i32(tmp, cpu_env, offsetof(CPUARMState, cp15.c13_tls2)); + break; + case 4: + tcg_gen_ld_i32(tmp, cpu_env, offsetof(CPUARMState, cp15.c13_tls3)); + break; + default: + dead_tmp(tmp); + return 0; + } + store_reg(s, rd, tmp); + + } else { + tmp = load_reg(s, rd); + switch (op) { + case 2: + tcg_gen_st_i32(tmp, cpu_env, offsetof(CPUARMState, cp15.c13_tls1)); + break; + case 3: + tcg_gen_st_i32(tmp, cpu_env, offsetof(CPUARMState, cp15.c13_tls2)); + break; + case 4: + tcg_gen_st_i32(tmp, cpu_env, offsetof(CPUARMState, cp15.c13_tls3)); + break; + default: + return 0; + } + dead_tmp(tmp); + } + return 1; +} + /* Disassemble system coprocessor (cp15) instruction. Return nonzero if instruction is not defined. */ static int disas_cp15_insn(CPUState *env, DisasContext *s, uint32_t insn) @@ -2489,6 +2540,10 @@ static int disas_cp15_insn(CPUState *env, DisasContext *s, uint32_t insn) return 0; } rd = (insn 12) 0xf; + + if (cp15_tls_load_store(env, s, insn, rd)) + return 0; + tmp2 = tcg_const_i32(insn); if (insn ARM_CP_RW_BIT) { tmp = new_tmp(); -- 1.6.5
Re: [Qemu-devel] [PATCH 4/5] linux-user: Add access to TLS registers
On Tue, Jan 26, 2010 at 5:00 PM, Riku Voipio riku.voi...@iki.fi wrote: From: Riku Voipio riku.voi...@nokia.com If you compile applications with gcc -mtp=cp15, __thread access's will generate an abort. Implement accessing allowed cp15.c13 registers on ARMv6K+ in linux-user. Signed-off-by: Riku Voipio riku.voi...@nokia.com --- target-arm/helper.c | 27 ++- 1 files changed, 26 insertions(+), 1 deletions(-) diff --git a/target-arm/helper.c b/target-arm/helper.c index b3aec99..68578ce 100644 --- a/target-arm/helper.c +++ b/target-arm/helper.c @@ -505,13 +505,38 @@ uint32_t HELPER(get_cp)(CPUState *env, uint32_t insn) void HELPER(set_cp15)(CPUState *env, uint32_t insn, uint32_t val) { + int op2; + + op2 = (insn 5) 7; + /* Allow write access to CP15 User RW Thread ID Register */ + if (arm_feature (env, ARM_FEATURE_V6K) ((insn 16) 0xf) == 13) { + switch (op2) { + case 2: + env-cp15.c13_tls1 = val; + return; + } + } cpu_abort(env, cp15 insn %08x\n, insn); } uint32_t HELPER(get_cp15)(CPUState *env, uint32_t insn) { + int op2; + /* Allow read access to CP15 User RW and RO Thread ID Registers */ + + op2 = (insn 5) 7; + if (arm_feature (env, ARM_FEATURE_V6K) ((insn 16) 0xf) == 13) { + switch (op2) { + case 2: + return env-cp15.c13_tls1; + case 3: + return env-cp15.c13_tls2; + default: + goto bad_reg; + } + } +bad_reg: cpu_abort(env, cp15 insn %08x\n, insn); - return 0; } /* These should probably raise undefined insn exceptions. */ Most of the checks you do here could be done in translate.c. Wouldn't it be better to do them there? Laurent