[PATCH] test long JMP emulation
Goes on top of Jcc tests patch. Signed-off-by: Gleb Natapov g...@redhat.com diff --git a/user/test/x86/realmode.c b/user/test/x86/realmode.c index 336ba1c..755b5d1 100644 --- a/user/test/x86/realmode.c +++ b/user/test/x86/realmode.c @@ -451,6 +451,23 @@ void test_jcc_near(void) print_serial(JMP near Test 1: FAIL\n); } +void test_long_jmp() +{ + struct regs inregs = { 0 }, outregs; + u32 esp[16]; + + inregs.esp = (u32)esp; + MK_INSN(long_jmp, call 1f\n\t + jmp 2f\n\t + 1: jmp $0, $test_function\n\t + 2:\n\t); + exec_in_big_real_mode(inregs, outregs, + insn_long_jmp, + insn_long_jmp_end - insn_long_jmp); + if(!regs_equal(inregs, outregs, R_AX) || outregs.eax != 0x1234) + print_serial(Long JMP Test: FAIL\n); +} + void test_null(void) { struct regs inregs = { 0 }, outregs; @@ -473,6 +490,8 @@ void start(void) test_jcc_near(); /* test_call() uses short jump so call it after testing jcc */ test_call(); + /* long jmp test uses call near so test it after testing call */ + test_long_jmp(); exit(0); } -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/3] Add new mode of instruction emulation: skip.
In the new mode instruction is decoded, but not executed. The EIP is moved to point after the instruction. Signed-off-by: Gleb Natapov g...@redhat.com --- arch/x86/include/asm/kvm_host.h |1 + arch/x86/kvm/x86.c |5 + 2 files changed, 6 insertions(+), 0 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3fc4623..e672ca5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -565,6 +565,7 @@ enum emulation_result { #define EMULTYPE_NO_DECODE (1 0) #define EMULTYPE_TRAP_UD (1 1) +#define EMULTYPE_SKIP (1 2) int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, unsigned long cr2, u16 error_code, int emulation_type); void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1d9a312..32c7b8f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2411,6 +2411,11 @@ int emulate_instruction(struct kvm_vcpu *vcpu, } } + if (emulation_type EMULTYPE_SKIP) { + kvm_rip_write(vcpu, vcpu-arch.emulate_ctxt.decode.eip); + return EMULATE_DONE; + } + r = x86_emulate_insn(vcpu-arch.emulate_ctxt, emulate_ops); if (vcpu-arch.pio.string) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/3] Completely decode instruction in decoding stage.
After instruction decoding decode_cache.eip should point after instruction. Signed-off-by: Gleb Natapov g...@redhat.com --- arch/x86/kvm/x86_emulate.c | 107 1 files changed, 38 insertions(+), 69 deletions(-) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index d7c9f6f..0aef8bc 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -76,6 +76,7 @@ #define Src2CL (129) #define Src2ImmByte (229) #define Src2One (329) +#define Src2Imm16 (429) #define Src2Mask(729) enum { @@ -135,8 +136,10 @@ static u32 opcode_table[256] = { SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ /* 0x70 - 0x77 */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps | SrcImmByte, ImplicitOps | SrcImmByte, + ImplicitOps | SrcImmByte, ImplicitOps | SrcImmByte, + ImplicitOps | SrcImmByte, ImplicitOps | SrcImmByte, + ImplicitOps | SrcImmByte, ImplicitOps | SrcImmByte, /* 0x78 - 0x7F */ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, @@ -153,7 +156,8 @@ static u32 opcode_table[256] = { /* 0x90 - 0x97 */ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, /* 0x98 - 0x9F */ - 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, + 0, 0, SrcImm | Src2Imm16, 0, ImplicitOps | Stack, ImplicitOps | Stack, + 0, 0, /* 0xA0 - 0xA7 */ ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, @@ -178,7 +182,8 @@ static u32 opcode_table[256] = { 0, ImplicitOps | Stack, 0, 0, ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, /* 0xC8 - 0xCF */ - 0, 0, 0, ImplicitOps | Stack, 0, 0, 0, 0, + 0, 0, 0, ImplicitOps | Stack, ImplicitOps, SrcImmByte, ImplicitOps, + ImplicitOps, /* 0xD0 - 0xD7 */ ByteOp | DstMem | SrcImplicit | 
ModRM, DstMem | SrcImplicit | ModRM, ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, @@ -187,11 +192,13 @@ static u32 opcode_table[256] = { 0, 0, 0, 0, 0, 0, 0, 0, /* 0xE0 - 0xE7 */ 0, 0, 0, 0, - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, + SrcNone | ByteOp | SrcImmByte | ImplicitOps, + SrcNone | SrcImmByte | ImplicitOps, + SrcNone | ByteOp | SrcImmByte | ImplicitOps, + SrcNone | SrcImmByte | ImplicitOps, /* 0xE8 - 0xEF */ - ImplicitOps | Stack, SrcImm | ImplicitOps, - ImplicitOps, SrcImmByte | ImplicitOps, + SrcImm | ImplicitOps | Stack, SrcImm | ImplicitOps, + SrcImm | Src2Imm16 | ImplicitOps, SrcImmByte | ImplicitOps, SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* 0xF0 - 0xF7 */ @@ -230,10 +237,12 @@ static u32 twobyte_table[256] = { /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 - 0x8F */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + SrcImm | ImplicitOps, SrcImm | ImplicitOps, SrcImm | ImplicitOps, + SrcImm | ImplicitOps, SrcImm | ImplicitOps, SrcImm | ImplicitOps, + SrcImm | ImplicitOps, SrcImm | ImplicitOps, SrcImm | ImplicitOps, + SrcImm | ImplicitOps, SrcImm | ImplicitOps, SrcImm | ImplicitOps, + SrcImm | ImplicitOps, SrcImm | ImplicitOps, SrcImm | ImplicitOps, + SrcImm | ImplicitOps, /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 - 0xA7 */ @@ -1072,6 +1081,12 @@ done_prefixes: c-src2.bytes = 1; c-src2.val = insn_fetch(u8, 1, c-eip); break; + case Src2Imm16: + c-src2.type = OP_IMM; + c-src2.ptr = (unsigned long *)c-eip; + c-src2.bytes = 2; + c-src2.val = insn_fetch(u16, 2, c-eip); + break; case Src2One: c-src2.bytes = 1; c-src2.val = 1; @@ -1531,13 +1546,10 @@ special_insn: return 
-1; } return 0; - case 0x70 ... 0x7f: /* jcc (short) */ { - int rel = insn_fetch(s8, 1, c-eip); - + case 0x70 ... 0x7f: /* jcc (short) */ if (test_cc(c-b, ctxt-eflags)) - jmp_rel(c, rel); +
[PATCH 3/3] [AMD] Skip instruction on a task switch only when appropriate.
If a task switch was initiated because of a task gate in the IDT and the IDT was accessed because of an external event, the instruction should not be skipped. Signed-off-by: Gleb Natapov g...@redhat.com --- arch/x86/kvm/svm.c | 11 +-- 1 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 3ffb695..053f3c5 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1826,6 +1826,7 @@ static int task_switch_interception(struct vcpu_svm *svm, int reason; int int_type = svm-vmcb-control.exit_int_info SVM_EXITINTINFO_TYPE_MASK; + int int_vec = svm-vmcb-control.exit_int_info SVM_EVTINJ_VEC_MASK; tss_selector = (u16)svm-vmcb-control.exit_info_1; @@ -1841,8 +1842,14 @@ static int task_switch_interception(struct vcpu_svm *svm, reason = TASK_SWITCH_CALL; - if (reason != TASK_SWITCH_GATE || int_type == SVM_EXITINTINFO_TYPE_SOFT) - skip_emulated_instruction(svm-vcpu); + if (reason != TASK_SWITCH_GATE || + int_type == SVM_EXITINTINFO_TYPE_SOFT || + (int_type == SVM_EXITINTINFO_TYPE_EXEPT +(int_vec == OF_VECTOR || int_vec == BP_VECTOR))) { + if (emulate_instruction(svm-vcpu, kvm_run, 0, 0, + EMULTYPE_SKIP) != EMULATE_DONE) + return 0; + } return kvm_task_switch(svm-vcpu, tss_selector, reason); } -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 00/17] virtual-bus
Rusty Russell wrote: On Thursday 02 April 2009 02:40:29 Anthony Liguori wrote: Rusty Russell wrote: As you point out, 350-450 is possible, which is still bad, and it's at least partially caused by the exit to userspace and two system calls. If virtio_net had a backend in the kernel, we'd be able to compare numbers properly. I doubt the userspace exit is the problem. On a modern system, it takes about 1us to do a light-weight exit and about 2us to do a heavy-weight exit. A transition to userspace is only about ~150ns, the bulk of the additional heavy-weight exit cost is from vcpu_put() within KVM. Just to inject some facts, servicing a ping via tap (ie host-guest then guest-host response) takes 26 system calls from one qemu thread, 7 from another (see strace below). Judging by those futex calls, multiple context switches, too. Interesting stuff. Even if amortized over half a ring's worth of packets, that's quite a lot. Two threads are involved (we complete on the iothread, since we don't know which vcpu will end up processing the interrupt, if any). Pid 10260: 12:37:40.245785 select(17, [4 6 8 14 16], [], [], {0, 996000}) = 1 (in [6], left {0, 992000}) 0.003995 Should switch to epoll with its lower wait costs. Unfortunately the relative timeout requires reading the clock. 12:37:40.250226 read(6, \0\0\0\0\0\0\0\0\0\0RT\0\0224V*\211\24\210`\304\10\0E\0..., 69632) = 108 0.51 12:37:40.250462 write(1, tap read: 108 bytes\n, 20) = 20 0.000197 I hope this is your addition. 12:37:40.250800 ioctl(7, 0x4008ae61, 0x7fff8cafb3a0) = 0 0.000223 12:37:40.251149 read(6, 0x115c6ac, 69632) = -1 EAGAIN (Resource temporarily unavailable) 0.19 This wouldn't be necessary with io_getevents(). 12:37:40.251292 write(1, tap read: -1 bytes\n, 19) = 19 0.85 ... 12:37:40.251488 clock_gettime(CLOCK_MONOTONIC, {1554, 633304282}) = 0 0.20 12:37:40.251604 clock_gettime(CLOCK_MONOTONIC, {1554, 633413793}) = 0 0.19 Great. 12:37:40.251717 futex(0xb81360, 0x81 /* FUTEX_??? 
*/, 1) = 1 0.001222 12:37:40.253037 select(17, [4 6 8 14 16], [], [], {1, 0}) = 1 (in [16], left {1, 0}) 0.26 12:37:40.253196 read(16, \16\0\0\0\0\0\0\0\376\377\377\377\0\0\0\0\0\0\0\0\0\0\0..., 128) = 128 0.22 12:37:40.253324 rt_sigaction(SIGALRM, NULL, {0x406d50, ~[KILL STOP RTMIN RT_1], SA_RESTORER, 0x7f1a842430f0}, 8) = 0 0.18 12:37:40.253477 write(5, \0, 1) = 1 0.22 The write is to wake someone up. Who? 12:37:40.253585 read(16, 0x7fff8cb09440, 128) = -1 EAGAIN (Resource temporarily unavailable) 0.20 Clearing up signalfd... 12:37:40.253687 clock_gettime(CLOCK_MONOTONIC, {1554, 635496181}) = 0 0.19 12:37:40.253798 writev(6, [{\0\0\0\0\0\0\0\0\0\0, 10}, {*\211\24\210`\304rt\0\0224v\10\0e\0\0t\255\262\...@\1g..., 98}], 2) = 108 0.62 12:37:40.253993 ioctl(7, 0x4008ae61, 0x7fff8caff460) = 0 0.000161 Injecting the interrupt. 12:37:40.254263 clock_gettime(CLOCK_MONOTONIC, {1554, 636077540}) = 0 0.19 12:37:40.254380 futex(0xb81360, 0x81 /* FUTEX_??? */, 1) = 1 0.000394 12:37:40.254861 select(17, [4 6 8 14 16], [], [], {1, 0}) = 1 (in [4], left {1, 0}) 0.22 12:37:40.255001 read(4, \0, 512) = 1 0.21 Great, the write() was to wake ourselves up. 
12:37:40.255109 read(4, 0x7fff8cb092d0, 512) = -1 EAGAIN (Resource temporarily unavailable) 0.18 12:37:40.255211 clock_gettime(CLOCK_MONOTONIC, {1554, 637020677}) = 0 0.19 12:37:40.255314 clock_gettime(CLOCK_MONOTONIC, {1554, 637123483}) = 0 0.19 12:37:40.255416 timer_gettime(0, {it_interval={0, 0}, it_value={0, 0}}) = 0 0.18 12:37:40.255524 timer_settime(0, 0, {it_interval={0, 0}, it_value={0, 1400}}, NULL) = 0 0.21 12:37:40.255635 clock_gettime(CLOCK_MONOTONIC, {1554, 637443915}) = 0 0.19 12:37:40.255739 clock_gettime(CLOCK_MONOTONIC, {1554, 637547001}) = 0 0.18 12:37:40.255847 select(17, [4 6 8 14 16], [], [], {1, 0}) = 1 (in [16], left {0, 988000}) 0.014303 This is the vcpu thread: Pid 10262: 12:37:40.252531 clock_gettime(CLOCK_MONOTONIC, {1554, 634339051}) = 0 0.18 12:37:40.252631 timer_gettime(0, {it_interval={0, 0}, it_value={0, 17549811}}) = 0 0.21 12:37:40.252750 timer_settime(0, 0, {it_interval={0, 0}, it_value={0, 25}}, NULL) = 0 0.24 12:37:40.252868 ioctl(11, 0xae80, 0)= 0 0.001171 12:37:40.254128 futex(0xb81360, 0x80 /* FUTEX_??? */, 2) = 0 0.000270 12:37:40.254490 ioctl(7, 0x4008ae61, 0x4134bee0) = 0 0.19 12:37:40.254598 futex(0xb81360, 0x81 /* FUTEX_??? */, 1) = 0 0.17 12:37:40.254693 ioctl(11, 0xae80 unfinished ... Looks like the interrupt from the iothread was injected and delivered before the iothread could give up the mutex, so we needed to wait here. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to
Re: cr3 OOS optimisation breaks 32-bit GNU/kFreeBSD guest
Marcelo Tosatti wrote: The problem is when the page is unreachable due to a higher level path being unlinked. Say: level 4 - level 3 . level 2 - level 1 (global unsync) The dot there means level 3 is not linked to level 2, so invlpg can't reach the global unsync at level 1. kvm_mmu_get_page does sync pages when it finds them, so the code is already safe for the linking a page which contains global ptes case you mention above. The recursive resync ignores global pages (or it would hit them on cr3 switch too). But we have a bigger problem, invlpg can miss even if nothing is unlinked: access address x through global pte - pte instantiated into spte switch to cr3 where x is not mapped, or mapped differently write to pte - no fault since pte is unsync invlpg x - no way this can hit the spte switch cr3 back access address x - use old spte Here's one way to make this work: - add a hash of global pagetables, indexed by virtual address instead of the pagetable's gfn - invlpg checks this hash in addition to the recursive walk We'd need to make the virtual address part of sp-role to avoid needing to link the same page multiple times in the virtual address hash. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[ kvm-Bugs-2733678 ] network problems
Bugs item #2733678, was opened at 2009-04-05 10:51 Message generated for change (Tracker Item Submitted) made by lacsap2 You can respond by visiting: https://sourceforge.net/tracker/?func=detailatid=893831aid=2733678group_id=180599 Please note that this message will contain a full copy of the comment thread, including the initial issue submission, for this request, not just the latest update. Category: intel Group: None Status: Open Resolution: None Priority: 5 Private: No Submitted By: Lacsap Lacsap (lacsap2) Assigned to: Nobody/Anonymous (nobody) Summary: network problems Initial Comment: hi, some problems with network... 1) with socket # modprobe kvm # modprobe kvm-intel $ kvm -hda mydisk -m 192 -localtime -k fr -net nic -net socket,listen=:1234 KO segmentation fault with option -net nic -net socket,listen=:1234 OK with user or tap nic, no problem KO same problem with qemu KO same problem with -no-kvm option 2) with vde switch # modprobe kvm # modprobe kvm-intel # modprobe tun # tunctl -u toto # ifconfig tap0 172.20.0.1 netmask 255.255.255.0 up $ vde_switch -s /tmp/switch -daemon $ vde_plug2tap -s /tmp/switch -daemon tap0 $ vdekvm -hda mydisk -net nic -net vde,sock=/tmp/switch KO arg ,sock=/tmp/switch TUNGETIFF ioctl() failed: Invalid argument starts but without connected to the virtual switch KO samsung kernel: kvm: 6245: cpu0 unhandled wrmsr: 0xc0010117 data 0 in /var/log/everything.log OK with vdeqemu informations: # cat /proc/cpuinfo processor : 0 vendor_id : GenuineIntel cpu family : 6 model : 23 model name : Intel(R) Core(TM)2 Duo CPU T8100 @ 2.10GHz stepping: 6 cpu MHz : 2094.751 cache size : 3072 KB physical id : 0 siblings: 2 core id : 0 cpu cores : 2 apicid : 0 initial apicid : 0 fdiv_bug: no hlt_bug : no f00f_bug: no coma_bug: no fpu : yes fpu_exception : yes cpuid level : 10 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe lm constant_tsc arch_perfmon pebs bts pni 
dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm sse4_1 lahf_lm ida tpr_shadow vnmi flexpriority bogomips: 4191.06 clflush size: 64 power management: processor : 1 vendor_id : GenuineIntel cpu family : 6 model : 23 model name : Intel(R) Core(TM)2 Duo CPU T8100 @ 2.10GHz stepping: 6 cpu MHz : 2094.751 cache size : 3072 KB physical id : 0 siblings: 2 core id : 1 cpu cores : 2 apicid : 1 initial apicid : 1 fdiv_bug: no hlt_bug : no f00f_bug: no coma_bug: no fpu : yes fpu_exception : yes cpuid level : 10 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe lm constant_tsc arch_perfmon pebs bts pni dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm sse4_1 lahf_lm ida tpr_shadow vnmi flexpriority bogomips: 4191.06 clflush size: 64 power management: # kvm | grep -i version QEMU PC emulator version 0.9.1 (kvm-84), Copyright (c) 2003-2008 Fabrice Bellard # grep -i qemu /var/log/pacman.log | tail -n 1 installed qemu (0.10.0-1) # uname -a Linux samsung 2.6.28-ARCH #1 SMP PREEMPT Tue Mar 17 06:42:43 UTC 2009 i686 Intel(R) Core(TM)2 Duo CPU T8100 @ 2.10GHz GenuineIntel GNU/Linux # grep kernel /var/log/pacman.log | tail -n 1 [2009-03-26 18:27] upgraded kernel26 (2.6.28.7-2 - 2.6.28.8-1) upgraded qemu (0.10.0-1 - 0.10.1-1) KO segmentation fault still remain # dmesg | tail : qemu[5710]: segfault at 0 ip b7a73063 sp bfd59cfc error 4 in libc-2.9.so[b7a0+14] -- You can respond by visiting: https://sourceforge.net/tracker/?func=detailatid=893831aid=2733678group_id=180599 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 00/17] virtual-bus
Gregory Haskins wrote: 2) the vbus-proxy and kvm-guest patch go away 3) the kvm-host patch changes to work with coordination from the userspace-pci emulation for things like MSI routing 4) qemu will know to create some MSI shim 1:1 with whatever it instantiates on the bus (and can communicate changes Don't userstand. What's this MSI shim? Well, if the device model was an object in vbus down in the kernel, yet PCI emulation was up in qemu, presumably we would want something to handle things like PCI config-cycles up in userspace. Like, for instance, if the guest re-routes the MSI. The shim/proxy would handle the config-cycle, and then turn around and do an ioctl to the kernel to configure the change with the in-kernel device model (or the irq infrastructure, as required). Right, this is how it should work. All the gunk in userspace. But, TBH, I haven't really looked into whats actually required to make this work yet. I am just spitballing to try to find a compromise. One thing I thought of trying to get this generic is to use file descriptors as irq handles. So: - userspace exposes a PCI device (same as today) - guest configures its PCI IRQ (using MSI if it supports it) - userspace handles this by calling KVM_IRQ_FD which converts the irq to a file descriptor - userspace passes this fd to the kernel, or another userspace process - end user triggers guest irqs by writing to this fd We could do the same with hypercalls: - guest and host userspace negotiate hypercall use through PCI config space - userspace passes an fd to the kernel - whenever the guest issues an hypercall, the kernel writes the arguments to the fd - other end (in kernel or userspace) processes the hypercall No, you are confusing the front-end and back-end again ;) The back-end remains, and holds the device models as before. This is the vbus core. Today the front-end interacts with the hypervisor to render vbus specific devices. 
The proposal is to eliminate the front-end, and have the back end render the objects on the bus as PCI devices to the guest. I am not sure if I can make it work, yet. It needs more thought. It seems to me this already exists, it's the qemu device model. The host kernel doesn't need any knowledge of how the devices are connected, even if it does implement some of them. . I don't think you've yet set down what its advantages are. Being pure and clean doesn't count, unless you rip out PCI from all existing installed hardware and from Windows. You are being overly dramatic. No one has ever said we are talking about ripping something out. In fact, I've explicitly stated that PCI can coexist peacefully. Having more than one bus in a system is certainly not without precedent (PCI, scsi, usb, etc). Rather, PCI is PCI, and will always be. PCI was designed as a software-to-hardware interface. It works well for its intention. When we do full emulation of guests, we still do PCI so that all that software that was designed to work software-to-hardware still continues to work, even though technically it's now software-to-software. When we do PV, on the other hand, we no longer need to pretend it is software-to-hardware. We can continue to use an interface designed for software-to-hardware if we choose, or we can use something else such as an interface designed specifically for software-to-software. As I have stated, PCI was designed with hardware constraints in mind. What if I don't want to be governed by those constraints? I'd agree with all this if I actually saw a constraint in PCI. But I don't. What if I don't want an interrupt per device (I don't)? Don't. Though I think you do, even multiple interrupts per device. What do I need BARs for (I don't)? Don't use them. Is a PCI PIO address relevant to me (no, hypercalls are more direct)? Etc. It's crap I don't need. So use hypercalls. 
All I really need is a way to a) discover and enumerate devices, preferably dynamically (hotswap), and b) a way to communicate with those devices. I think you are overstating the importance that PCI plays in (a), and are overstating the complexity associated with doing an alternative. Given that we have PCI, why would we do an alternative? It works, it works with Windows, the nasty stuff is in userspace. Why expend effort on an alternative? Instead make it go faster. I think you are understating the level of hackiness required to continue to support PCI as we move to new paradigms, like in-kernel models. The kernel need know nothing about PCI, so I don't see how you work this out. And I think I have already stated that I can establish a higher degree of flexibility, and arguably, performance for (b). You've stated it, but failed to provide arguments for it. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a
Re: [RFC PATCH 00/17] virtual-bus
Gregory Haskins wrote: You don't gain simplicity by adding things. But you are failing to account for the fact that we still have to add something for PCI if we go with something like the in-kernel model. It's nice for the userspace side because a) it was already in qemu, and b) we need it for proper guest support. But we don't presumably have it for this new thing, so something has to be created (unless this support is somehow already there and I don't know it?) No, a virtio server in the kernel would know nothing about PCI. Userspace would handle the PCI interface and configure the kernel. That way we can reuse the kernel part for lguest and s390. Optimization: Most of PCI (in our context) deals with configuration. So removing it doesn't optimize anything, unless you're counting hotplugs-per-second or something. Most, but not all ;) (Sorry, you left the window open on that one). What about IRQ routing? That's already in the kernel. What if I want to coalesce interrupts to minimize injection overhead? How do I do that in PCI? It has nothing to do with PCI. It has to do with the device/guest protocol. And virtio already does that (badly, in the case of network tx). How do I route those interrupts in an arbitrarily nested fashion, say, to a guest userspace? That's a guest problem. kvm delivers an interrupt; if the guest knows how to service it in userspace, great. What about scale? What if Herbert decides to implement a 2048 ring MQ device ;) There's no great way to do that in x86 with PCI, yet I can do it in vbus. (And yes, I know, this is ridiculous..just wanting to get you thinking) I don't see why you can't do 2048 (or even 2049) rings with PCI. You'd point some config space address at a 'ring descriptor table' and that's it. There would be no problem supporting an in-kernel host virtio endpoint with the existing guest/host ABI. Nothing in the ABI assumes the host endpoint is in userspace. 
Nothing in the implementation requires us to move any of the PCI stuff into the kernel. Well, that's not really true. If the device is a PCI device, there is *some* stuff that has to go into the kernel. Not an ICH model or anything, but at least an ability to interact with userspace for config-space changes, etc. Config space changes go to userspace anyway. You'd need an interface to let userspace configure the kernel, but that's true for every device in the kernel. And you don't want to let the guest configure the kernel directly, you want userspace to be able to keep control of things. To avoid reiterating, please be specific about these advantages. We are both reading the same thread, right? Using different languages? Last time we measured, hypercall overhead was the same as pio overhead. Both vmx and svm decode pio completely (except for string pio ...) Not on my woodcrests last time I looked, but I'll check again. On woodcrests too. See vmx.c:handle_io(). True, PCI interrupts suck. But this was fixed with MSI. Why fix it again? As I stated, I don't like the constraints in place even by MSI (though that is definitely a step in the right direction). Which constraints? With vbus I can have a device that has an arbitrary number of shm regions (limited by memory, of course), So you can with PCI. each with an arbitrarily routed signal path that is limited by a u64, even on x86. There are still only 224 vectors per vcpu. Each region can be signaled bidirectionally and masked with a simple local memory write. They can be declared on the fly, allowing for the easy expression of things like nested devices or other dynamic resources. They can be routed across various topologies, such as IRQs or posix signals, even across multiple hops in a single path. How do I do that in PCI? Not sure what this nesting means. If I understand the rest, I think you can do it. What does masking an interrupt look like? It's a protocol between the device and the guest. PCI doesn't specify it. 
So you can use a bit in shared memory if you like. Again, for the nested case? What's that? Interrupt acknowledgment cycles? Standard for the platform. Again it's outside the scope of PCI. One of my primary design objectives with vbus was to a) reduce the signaling as much as possible, and b) reduce the cost of signaling. That is why I do things like use explicit hypercalls, aggregated interrupts, bidir napi to mitigate signaling, the shm_signal::pending mitigation, and avoiding going to userspace by running in the kernel. All of these things together help to form what I envision would be a maximum performance transport. Not all of these tricks are interdependent (for instance, the bidir + full-duplex threading that I do can be done in userspace too, as discussed). They are just the collective design elements that I think we need to make a guest perform very close to
Re: cr3 OOS optimisation breaks 32-bit GNU/kFreeBSD guest
On Sun, Apr 05, 2009 at 11:41:39AM +0300, Avi Kivity wrote: Marcelo Tosatti wrote: The problem is when the page is unreachable due to a higher level path being unlinked. Say: level 4 - level 3 . level 2 - level 1 (global unsync) The dot there means level 3 is not linked to level 2, so invlpg can't reach the global unsync at level 1. kvm_mmu_get_page does sync pages when it finds them, so the code is already safe for the linking a page which contains global ptes case you mention above. The recursive resync ignores global pages (or it would hit them on cr3 switch too). But we have a bigger problem, invlpg can miss even if nothing is unlinked: access address x through global pte - pte instantiated into spte switch to cr3 where x is not mapped, or mapped differently write to pte - no fault since pte is unsync invlpg x - no way this can hit the spte switch cr3 back access address x - use old spte Here's one way to make this work: - add a hash of global pagetables, indexed by virtual address instead of the pagetable's gfn - invlpg checks this hash in addition to the recursive walk We'd need to make the virtual address part of sp-role to avoid needing to link the same page multiple times in the virtual address hash. Humpf, yes. It seems it's too expensive/complex to handle this, for such small gain (~= 2% on AIM7 with RHEL3 guest). Are you okay with just disabling the global pages optimization? -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: cr3 OOS optimisation breaks 32-bit GNU/kFreeBSD guest
Marcelo Tosatti wrote: Here's one way to make this work: - add a hash of global pagetables, indexed by virtual address instead of the pagetable's gfn - invlpg checks this hash in addition to the recursive walk We'd need to make the virtual address part of sp-role to avoid needing to link the same page multiple times in the virtual address hash. Humpf, yes. It seems its too expensive/complex to handle this, for such small gain (~= 2% on AIM7 with RHEL3 guest). Are you okay with just disabling the global pages optimization? Definitely to plug the hole; and probably for later as well, unless people cry out due to regressions. Please send it in two patches: one a trivial one to disable global page detection which can be sent to -stable as well, and a follow on which rips out the global page machinery until (and if) we decide to reimplement it correctly. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: AHCI?
tsuraan wrote: Is there any plan to add AHCI support to kvm? It seems like it would be an ideal alternative to the LSI SCSI driver, since AHCI is supported by 64-bit Solaris as well as nearly every other modern OS. No plan that I know of. Is the LSI scsi device not supported by Solaris? -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Can't boot guest with more than 3585MB when using large pages
Alex Williamson wrote: On Fri, 2009-04-03 at 20:28 -0300, Marcelo Tosatti wrote: Can you please try the following Thanks Marcelo, this seems to fix it. I tested up to a 30G guest with large pages. I've applied the patch, thanks. I keep thinking we need to do additional rounding when we allocate the file. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: persistent tun different virtual NICs dead guest network
(cc qemu-devel) Michael Tokarev wrote: Hello. 2 days debugging an.. issue here, and finally got it. To make the long and painful (it was for me anyway) story short... kvm provides a way to control various offload settings on the host side of the tun network device (I mean the `-net tap' setup) from within guest. I.e., guest can set/clear various offload bits according to its capabilities/wishes. The problem is that different virtual NICs as used by kvm/qemu expects and sets different offload bits for the virtual NIC. And sets only those bits which - as they think - differs from the default (all-off). This means that when changing virtual NIC model AND using persistent tun device, it's very likely to get inconsistent flags. For example, here's how the offload settings on the host looks like after using e1000 driver in guest (freshly created persistent tun device): rx-checksumming: on tx-checksumming: on scatter-gather: on tcp segmentation offload: on udp fragmentation offload: off generic segmentation offload: off large receive offload: off Here's the same setting when using virtio_net instead: rx-checksumming: on tx-checksumming: off scatter-gather: off tcp segmentation offload: off udp fragmentation offload: off generic segmentation offload: off large receive offload: off I.e., only rx-checksumming. When using virtio_net from 2.6.29, which supports LRO, it also turns on large receive offload. Now, say, I tried a host with e1000 driver, and it turned on tx, sg and tso bits. And now I'm trying to run a guest with new virtio-net NIC instead. It turns on lro bit, but the network does not work anyway: almost any packet that's being sent from host to the guest has incorrect checksum - because the NIC is marked as able to do tx-checksumming but it does not do it. The network is dead. Now, after trying that and this, not understanding what's going on etc, let's reboot back with e1000 NIC which worked a few minutes ago... just to discover that it does not work anymore too! 
Because previous attempt with virtio_net resulted in lro being on, but the driver does not support it! So now, we have a non-working network again, and now, it does not matter which driver we'll try: neither of them will work because the offload settings are broken. It's more: one can't control this stuff from the host side using standard ethtool: it says that the operation is not supported (I wonder how kvm performs the settings changes). The solution here is to re-create the tun device before changing the virtual NIC model. But it isn't always possible, esp. when guests are being run from non-root user (where persistent tun devices are most useful). Can this be fixed somehow please? I think all the settings should be reset to 0 when opening the tun device. This should definitely be fixed. I'll look at writing a patch. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: persistent tun different virtual NICs dead guest network
Avi Kivity wrote: I think all the settings should be reset to 0 when opening the tun device. This should definitely be fixed. I'll look at writing a patch. Okay, that's not in upstream qemu, so I committed a fix to kvm-userspace.git. Attached if you want to test it. -- error compiling committee.c: too many arguments to function From 25971710409c374e9486c960c297f324a9164a65 Mon Sep 17 00:00:00 2001 From: Avi Kivity a...@redhat.com Date: Sun, 5 Apr 2009 15:08:55 +0300 Subject: [PATCH] kvm: qemu: clear tap features on initialization tap features change how tap interprets data, so they must be cleared on initialization to prevent old settings from interfering with new guest instances. Signed-off-by: Avi Kivity a...@redhat.com --- qemu/net.c |1 + 1 files changed, 1 insertions(+), 0 deletions(-) diff --git a/qemu/net.c b/qemu/net.c index d753fa0..703d01c 100644 --- a/qemu/net.c +++ b/qemu/net.c @@ -930,6 +930,7 @@ static TAPState *net_tap_fd_init(VLANState *vlan, #endif #ifdef TUNSETOFFLOAD s-vc-set_offload = tap_set_offload; +tap_set_offload(s-vc, 0, 0, 0, 0); #endif qemu_set_fd_handler2(s-fd, tap_can_send, tap_send, NULL, s); snprintf(s-vc-info_str, sizeof(s-vc-info_str), fd=%d, fd); -- 1.6.0.6
Re: [PATCH] fix call near emulation
Gleb Natapov wrote: The length of the return address pushed onto the stack depends on the operand size, not the address size. Applied, thanks. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] add tests for short/near Jcc and call instruction emulation
Gleb Natapov wrote: Signed-off-by: Gleb Natapov g...@redhat.com Applied, thanks. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] test long JMP emulation
Gleb Natapov wrote: Goes on top of Jcc tests patch. Applied, thanks. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Fix display breakage when resizing the screen
Avi Kivity wrote: When the vga resolution changes, a new display surface is not allocated immediately; instead that is deferred until the next update. However, if we're running without a display client attached, that won't happen and the next bitblt is likely to cause a segfault by overflowing the display surface. Fix by reallocating the display immediately when the resolution changes. Tested with (Windows|Linux) x (cirrus|std) x (curses|sdl). Signed-off-by: Avi Kivity a...@redhat.com This patch breaks VC switching with -curses. Regards, Anthony Liguori -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/9] Add decoding of 16bit second immediate argument.
Such as segment number in lcall/ljmp Signed-off-by: Gleb Natapov g...@redhat.com --- arch/x86/kvm/x86_emulate.c |7 +++ 1 files changed, 7 insertions(+), 0 deletions(-) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index d7c9f6f..c015063 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -76,6 +76,7 @@ #define Src2CL (129) #define Src2ImmByte (229) #define Src2One (329) +#define Src2Imm16 (429) #define Src2Mask(729) enum { @@ -1072,6 +1073,12 @@ done_prefixes: c-src2.bytes = 1; c-src2.val = insn_fetch(u8, 1, c-eip); break; + case Src2Imm16: + c-src2.type = OP_IMM; + c-src2.ptr = (unsigned long *)c-eip; + c-src2.bytes = 2; + c-src2.val = insn_fetch(u16, 2, c-eip); + break; case Src2One: c-src2.bytes = 1; c-src2.val = 1; -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/9] Add lcall decoding.
No emulation yet. Signed-off-by: Gleb Natapov g...@redhat.com --- arch/x86/kvm/x86_emulate.c |3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index c015063..fe0dec2 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -154,7 +154,8 @@ static u32 opcode_table[256] = { /* 0x90 - 0x97 */ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, /* 0x98 - 0x9F */ - 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, + 0, 0, SrcImm | Src2Imm16 | ImplicitOps, 0, ImplicitOps | Stack, + ImplicitOps | Stack, 0, 0, /* 0xA0 - 0xA7 */ ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/9] Complete short/near jcc decoding in decode stage.
Signed-off-by: Gleb Natapov g...@redhat.com --- arch/x86/kvm/x86_emulate.c | 50 1 files changed, 18 insertions(+), 32 deletions(-) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index e7e4db1..1790933 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -136,11 +136,15 @@ static u32 opcode_table[256] = { SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ /* 0x70 - 0x77 */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps | SrcImmByte, ImplicitOps | SrcImmByte, + ImplicitOps | SrcImmByte, ImplicitOps | SrcImmByte, + ImplicitOps | SrcImmByte, ImplicitOps | SrcImmByte, + ImplicitOps | SrcImmByte, ImplicitOps | SrcImmByte, /* 0x78 - 0x7F */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps | SrcImmByte, ImplicitOps | SrcImmByte, + ImplicitOps | SrcImmByte, ImplicitOps | SrcImmByte, + ImplicitOps | SrcImmByte, ImplicitOps | SrcImmByte, + ImplicitOps | SrcImmByte, ImplicitOps | SrcImmByte, /* 0x80 - 0x87 */ Group | Group1_80, Group | Group1_81, Group | Group1_82, Group | Group1_83, @@ -232,10 +236,12 @@ static u32 twobyte_table[256] = { /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 - 0x8F */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + SrcImm | ImplicitOps, SrcImm | ImplicitOps, SrcImm | ImplicitOps, + SrcImm | ImplicitOps, SrcImm | ImplicitOps, SrcImm | ImplicitOps, + SrcImm | ImplicitOps, SrcImm | ImplicitOps, SrcImm | ImplicitOps, + SrcImm | ImplicitOps, SrcImm | ImplicitOps, SrcImm | ImplicitOps, + SrcImm | ImplicitOps, SrcImm | ImplicitOps, SrcImm | ImplicitOps, + SrcImm | 
ImplicitOps, /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 - 0xA7 */ @@ -1539,13 +1545,10 @@ special_insn: return -1; } return 0; - case 0x70 ... 0x7f: /* jcc (short) */ { - int rel = insn_fetch(s8, 1, c-eip); - + case 0x70 ... 0x7f: /* jcc (short) */ if (test_cc(c-b, ctxt-eflags)) - jmp_rel(c, rel); + jmp_rel(c, c-src.val); break; - } case 0x80 ... 0x83: /* Grp1 */ switch (c-modrm_reg) { case 0: @@ -2031,28 +2034,11 @@ twobyte_insn: if (!test_cc(c-b, ctxt-eflags)) c-dst.type = OP_NONE; /* no writeback */ break; - case 0x80 ... 0x8f: /* jnz rel, etc*/ { - long int rel; - - switch (c-op_bytes) { - case 2: - rel = insn_fetch(s16, 2, c-eip); - break; - case 4: - rel = insn_fetch(s32, 4, c-eip); - break; - case 8: - rel = insn_fetch(s64, 8, c-eip); - break; - default: - DPRINTF(jnz: Invalid op_bytes\n); - goto cannot_emulate; - } + case 0x80 ... 0x8f: /* jnz rel, etc*/ if (test_cc(c-b, ctxt-eflags)) - jmp_rel(c, rel); + jmp_rel(c, c-src.val); c-dst.type = OP_NONE; break; - } case 0xa3: bt: /* bt */ c-dst.type = OP_NONE; -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/9] Complete decoding of call near in decode stage.
Signed-off-by: Gleb Natapov g...@redhat.com --- arch/x86/kvm/x86_emulate.c | 15 ++- 1 files changed, 2 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 1790933..3c23af0 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -196,7 +196,7 @@ static u32 opcode_table[256] = { SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* 0xE8 - 0xEF */ - ImplicitOps | Stack, SrcImm | ImplicitOps, + SrcImm | ImplicitOps | Stack, SrcImm | ImplicitOps, SrcImm | Src2Imm16 | ImplicitOps, SrcImmByte | ImplicitOps, SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, @@ -1789,18 +1789,7 @@ special_insn: io_dir_in = 0; goto do_io; case 0xe8: /* call (near) */ { - long int rel; - switch (c-op_bytes) { - case 2: - rel = insn_fetch(s16, 2, c-eip); - break; - case 4: - rel = insn_fetch(s32, 4, c-eip); - break; - default: - DPRINTF(Call: Invalid op_bytes\n); - goto cannot_emulate; - } + long int rel = c-src.val; c-src.val = (unsigned long) c-eip; jmp_rel(c, rel); emulate_push(ctxt); -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 6/9] Completely decode in/out at decoding stage.
Signed-off-by: Gleb Natapov g...@redhat.com --- arch/x86/kvm/x86_emulate.c | 10 ++ 1 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 3c23af0..cf27e62 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -193,8 +193,10 @@ static u32 opcode_table[256] = { 0, 0, 0, 0, 0, 0, 0, 0, /* 0xE0 - 0xE7 */ 0, 0, 0, 0, - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, + SrcNone | ByteOp | SrcImmByte | ImplicitOps, + SrcNone | SrcImmByte | ImplicitOps, + SrcNone | ByteOp | SrcImmByte | ImplicitOps, + SrcNone | SrcImmByte | ImplicitOps, /* 0xE8 - 0xEF */ SrcImm | ImplicitOps | Stack, SrcImm | ImplicitOps, SrcImm | Src2Imm16 | ImplicitOps, SrcImmByte | ImplicitOps, @@ -1780,12 +1782,12 @@ special_insn: break; case 0xe4: /* inb */ case 0xe5: /* in */ - port = insn_fetch(u8, 1, c-eip); + port = c-src.val; io_dir_in = 1; goto do_io; case 0xe6: /* outb */ case 0xe7: /* out */ - port = insn_fetch(u8, 1, c-eip); + port = c-src.val; io_dir_in = 0; goto do_io; case 0xe8: /* call (near) */ { -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 7/9] Decode soft interrupt instructions.
Do not emulate them yet. Signed-off-by: Gleb Natapov g...@redhat.com --- arch/x86/kvm/x86_emulate.c |3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index cf27e62..fcaad4b 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -184,7 +184,8 @@ static u32 opcode_table[256] = { 0, ImplicitOps | Stack, 0, 0, ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, /* 0xC8 - 0xCF */ - 0, 0, 0, ImplicitOps | Stack, 0, 0, 0, 0, + 0, 0, 0, ImplicitOps | Stack, ImplicitOps, ImplicitOps | SrcImmByte, + ImplicitOps, ImplicitOps, /* 0xD0 - 0xD7 */ ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 8/9] Add new mode of instruction emulation: skip.
In the new mode instruction is decoded, but not executed. The EIP is moved to point after the instruction. Signed-off-by: Gleb Natapov g...@redhat.com --- arch/x86/include/asm/kvm_host.h |1 + arch/x86/kvm/x86.c |5 + 2 files changed, 6 insertions(+), 0 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3fc4623..e672ca5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -565,6 +565,7 @@ enum emulation_result { #define EMULTYPE_NO_DECODE (1 0) #define EMULTYPE_TRAP_UD (1 1) +#define EMULTYPE_SKIP (1 2) int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, unsigned long cr2, u16 error_code, int emulation_type); void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1d9a312..32c7b8f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2411,6 +2411,11 @@ int emulate_instruction(struct kvm_vcpu *vcpu, } } + if (emulation_type EMULTYPE_SKIP) { + kvm_rip_write(vcpu, vcpu-arch.emulate_ctxt.decode.eip); + return EMULATE_DONE; + } + r = x86_emulate_insn(vcpu-arch.emulate_ctxt, emulate_ops); if (vcpu-arch.pio.string) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 9/9] [AMD] Skip instruction on a task switch only when appropriate.
If a task switch was initiated because of a task gate in IDT and IDT was accessed because of an external event, the instruction should not be skipped. Signed-off-by: Gleb Natapov g...@redhat.com --- arch/x86/kvm/svm.c | 11 +-- 1 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 3ffb695..053f3c5 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1826,6 +1826,7 @@ static int task_switch_interception(struct vcpu_svm *svm, int reason; int int_type = svm-vmcb-control.exit_int_info SVM_EXITINTINFO_TYPE_MASK; + int int_vec = svm-vmcb-control.exit_int_info SVM_EVTINJ_VEC_MASK; tss_selector = (u16)svm-vmcb-control.exit_info_1; @@ -1841,8 +1842,14 @@ static int task_switch_interception(struct vcpu_svm *svm, reason = TASK_SWITCH_CALL; - if (reason != TASK_SWITCH_GATE || int_type == SVM_EXITINTINFO_TYPE_SOFT) - skip_emulated_instruction(svm-vcpu); + if (reason != TASK_SWITCH_GATE || + int_type == SVM_EXITINTINFO_TYPE_SOFT || + (int_type == SVM_EXITINTINFO_TYPE_EXEPT +(int_vec == OF_VECTOR || int_vec == BP_VECTOR))) { + if (emulate_instruction(svm-vcpu, kvm_run, 0, 0, + EMULTYPE_SKIP) != EMULATE_DONE) + return 0; + } return kvm_task_switch(svm-vcpu, tss_selector, reason); } -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 00/17] virtual-bus
Rusty Russell wrote: On Thursday 02 April 2009 02:40:29 Anthony Liguori wrote: Rusty Russell wrote: As you point out, 350-450 is possible, which is still bad, and it's at least partially caused by the exit to userspace and two system calls. If virtio_net had a backend in the kernel, we'd be able to compare numbers properly. I doubt the userspace exit is the problem. On a modern system, it takes about 1us to do a light-weight exit and about 2us to do a heavy-weight exit. A transition to userspace is only about ~150ns, the bulk of the additional heavy-weight exit cost is from vcpu_put() within KVM. Just to inject some facts, servicing a ping via tap (ie host-guest then guest-host response) takes 26 system calls from one qemu thread, 7 from another (see strace below). Judging by those futex calls, multiple context switches, too. N.B. we're not optimized for latency today. With the right infrastructure in userspace, I'm confident we could get this down. What we need is: 1) Lockless MMIO/PIO dispatch (there should be two IO registration interfaces, a new lockless one and the legacy one) 2) A virtio-net thread that's independent of the IO thread. It would be interesting to count the number of syscalls required in the lguest path since that should be a lot closer to optimal. Regards, Anthony Liguori -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: AHCI?
No plan that I know of. Is the LSI scsi device not supported by Solaris? Yeah, there was a short thread about it last July (http://www.mail-archive.com/kvm@vger.kernel.org/msg01633.html). The gist of it is that 64-bit OpenSolaris no longer supports the 53c895a SCSI controller. According to that thread, the 53c1010 is what VMWare uses, and OpenSolaris still uses that. I don't really understand how a VM exposes a virtual device to a hosted OS, but I assume that it involves looking at a driver for a piece of hardware and then writing some code that emulates the behaviour expected by that driver? I would think AHCI would be a great way to go, if that's the case, since it would be an actual implementation of a standard instead of an arbitrary card. Does kvm use the qemu source for its drivers, or is there a separate source tree for it now? I could at least have a look at the LSI driver so I have some idea what I'm talking about... -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Re: [PATCH] Fix display breakage when resizing the screen
Anthony Liguori wrote: Avi Kivity wrote: When the vga resolution changes, a new display surface is not allocated immediately; instead that is deferred until the next update. However, if we're running without a display client attached, that won't happen and the next bitblt is likely to cause a segfault by overflowing the display surface. Fix by reallocating the display immediately when the resolution changes. Tested with (Windows|Linux) x (cirrus|std) x (curses|sdl). Signed-off-by: Avi Kivity a...@redhat.com This patch breaks VC switching with -curses. Can someone explain what DisplaySurface::width means when using curses? It is initialized to a pixel value: ds-surface = qemu_create_displaysurface_from(640, 400, 0, 0, (uint8_t*) screen); then read in from the current surface: static void curses_resize(DisplayState *ds) { if (ds_get_width(ds) == gwidth ds_get_height(ds) == gheight) return; gwidth = ds_get_width(ds); gheight = ds_get_height(ds); curses_calc_pad(); ds-surface-width = width * FONT_WIDTH; ds-surface-height = height * FONT_HEIGHT; } But curses_calc_pad() does static void curses_calc_pad(void) { if (is_fixedsize_console()) { width = gwidth; height = gheight; } else { width = COLS; height = LINES; } If !is_fixedsize_console(), then the global width takes on a character cell count, later multiplied by FONT_WIDTH to become a pixel value again. But if is_fixedsize_console() is true (which happens to be the case here), then the global width is a pixel value (from gwidth), and when multiplied by FONT_WIDTH it becomes nonsense. Repeated calls to curses_resize() will inflate the value to hell. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Re: [PATCH] Fix display breakage when resizing the screen
Anthony Liguori wrote: This patch breaks VC switching with -curses. The attached incremental fixes it, by basically replicating the previous behaviour. I'll follow up with a new combined patch. -- error compiling committee.c: too many arguments to function diff --git a/hw/vga.c b/hw/vga.c index 404450f..54d0246 100644 --- a/hw/vga.c +++ b/hw/vga.c @@ -1631,6 +1631,9 @@ static void vga_update_resolution_graphics(VGAState *s) s-multi_run != multi_run || s-multi_scan != multi_scan || s-want_full_update) { +if (s-ds-surface-pf.depth == 0) { +goto dont_touch_display_surface; +} #if defined(WORDS_BIGENDIAN) == defined(TARGET_WORDS_BIGENDIAN) if (depth == 16 || depth == 32) { #else @@ -1647,6 +1650,7 @@ static void vga_update_resolution_graphics(VGAState *s) } else { qemu_console_resize(s-ds, disp_width, height); } +dont_touch_display_surface: s-last_scr_width = disp_width; s-last_scr_height = height; s-last_width = disp_width; @@ -1668,7 +1672,17 @@ static void vga_update_resolution_text(VGAState *s) cw != s-last_cw || cheight != s-last_ch || s-last_depth) { s-last_scr_width = width * cw; s-last_scr_height = height * cheight; -qemu_console_resize(s-ds, s-last_scr_width, s-last_scr_height); +if (s-ds-surface-pf.depth != 0) { +qemu_console_resize(s-ds, s-last_scr_width, s-last_scr_height); +} else { +/* + * curses expects width and height to be in character cell + * dimensions, not pixels. + */ +s-ds-surface-width = width; +s-ds-surface-height = height; +dpy_resize(s-ds); +} s-last_depth = 0; s-last_width = width; s-last_height = height;
[PATCH] Fix display breakage when resizing the screen (v2)
When the vga resolution changes, a new display surface is not allocated immediately; instead that is deferred until the next update. However, if we're running without a display client attached, that won't happen and the next bitblt is likely to cause a segfault by overflowing the display surface. Fix by reallocating the display immediately when the resolution changes. Tested with (Windows|Linux) x (cirrus|std) x (curses|sdl). Changes from v1: - fix segfault when switching virtual consoles with curses Signed-off-by: Avi Kivity a...@redhat.com --- hw/cirrus_vga.c | 11 ++- hw/vga.c| 275 +++ hw/vga_int.h|4 + 3 files changed, 170 insertions(+), 120 deletions(-) diff --git a/hw/cirrus_vga.c b/hw/cirrus_vga.c index 08fd4c2..223008e 100644 --- a/hw/cirrus_vga.c +++ b/hw/cirrus_vga.c @@ -1392,6 +1392,8 @@ cirrus_hook_write_sr(CirrusVGAState * s, unsigned reg_index, int reg_value) break; } +vga_update_resolution((VGAState *)s); + return CIRRUS_HOOK_HANDLED; } @@ -1419,6 +1421,7 @@ static void cirrus_write_hidden_dac(CirrusVGAState * s, int reg_value) #endif } s-cirrus_hidden_dac_lockindex = 0; +vga_update_resolution((VGAState *)s); } /*** @@ -1705,6 +1708,8 @@ cirrus_hook_write_cr(CirrusVGAState * s, unsigned reg_index, int reg_value) break; } +vga_update_resolution((VGAState *)s); + return CIRRUS_HOOK_HANDLED; } @@ -2830,6 +2835,7 @@ static void vga_ioport_write(void *opaque, uint32_t addr, uint32_t val) if (s-ar_flip_flop == 0) { val = 0x3f; s-ar_index = val; +vga_update_resolution((VGAState *)s); } else { index = s-ar_index 0x1f; switch (index) { @@ -2923,6 +2929,7 @@ static void vga_ioport_write(void *opaque, uint32_t addr, uint32_t val) /* can always write bit 4 of CR7 */ if (s-cr_index == 7) s-cr[7] = (s-cr[7] ~0x10) | (val 0x10); +vga_update_resolution((VGAState *)s); return; } switch (s-cr_index) { @@ -2951,6 +2958,7 @@ static void vga_ioport_write(void *opaque, uint32_t addr, uint32_t val) s-update_retrace_info((VGAState *) s); break; } 
+vga_update_resolution((VGAState *)s); break; case 0x3ba: case 0x3da: @@ -3157,7 +3165,8 @@ static int cirrus_vga_load(QEMUFile *f, void *opaque, int version_id) cirrus_update_memory_access(s); /* force refresh */ -s-graphic_mode = -1; +vga_update_resolution((VGAState *)s); +s-want_full_update = 1; cirrus_update_bank_ptr(s, 0); cirrus_update_bank_ptr(s, 1); return 0; diff --git a/hw/vga.c b/hw/vga.c index b1e4373..54d0246 100644 --- a/hw/vga.c +++ b/hw/vga.c @@ -36,6 +36,10 @@ //#define DEBUG_BOCHS_VBE +#define GMODE_TEXT 0 +#define GMODE_GRAPH1 +#define GMODE_BLANK 2 + /* force some bits to zero */ const uint8_t sr_mask[8] = { 0x03, @@ -393,6 +397,7 @@ static void vga_ioport_write(void *opaque, uint32_t addr, uint32_t val) if (s-ar_flip_flop == 0) { val = 0x3f; s-ar_index = val; +vga_update_resolution(s); } else { index = s-ar_index 0x1f; switch(index) { @@ -433,6 +438,7 @@ static void vga_ioport_write(void *opaque, uint32_t addr, uint32_t val) #endif s-sr[s-sr_index] = val sr_mask[s-sr_index]; if (s-sr_index == 1) s-update_retrace_info(s); +vga_update_resolution(s); break; case 0x3c7: s-dac_read_index = val; @@ -460,6 +466,7 @@ static void vga_ioport_write(void *opaque, uint32_t addr, uint32_t val) printf(vga: write GR%x = 0x%02x\n, s-gr_index, val); #endif s-gr[s-gr_index] = val gr_mask[s-gr_index]; +vga_update_resolution(s); break; case 0x3b4: case 0x3d4: @@ -475,6 +482,7 @@ static void vga_ioport_write(void *opaque, uint32_t addr, uint32_t val) /* can always write bit 4 of CR7 */ if (s-cr_index == 7) s-cr[7] = (s-cr[7] ~0x10) | (val 0x10); +vga_update_resolution(s); return; } switch(s-cr_index) { @@ -502,6 +510,7 @@ static void vga_ioport_write(void *opaque, uint32_t addr, uint32_t val) s-update_retrace_info(s); break; } +vga_update_resolution(s); break; case 0x3ba: case 0x3da: @@ -581,11 +590,13 @@ static void vbe_ioport_write_data(void *opaque, uint32_t addr, uint32_t val) if ((val = VBE_DISPI_MAX_XRES) ((val 7) == 0)) { s-vbe_regs[s-vbe_index] = val; } 
+vga_update_resolution(s); break; case VBE_DISPI_INDEX_YRES: if (val = VBE_DISPI_MAX_YRES) {
Re: AHCI?
tsuraan wrote: No plan that I know of. Is the LSI scsi device not supported by Solaris? Yeah, there was a short thread about it last July (http://www.mail-archive.com/kvm@vger.kernel.org/msg01633.html). The gist of it is that 64-bit OpenSolaris no longer supports the 53c895a SCSI controller. According to that thread, the 53c1010 is what VMWare uses, and OpenSolaris still uses that. I see. That sucks. I don't really understand how a VM exposes a virtual device to a hosted OS, but I assume that it involves looking at a driver for a piece of hardware and then writing some code that emulates the behaviour expected by that driver? The best way is to look at the device spec and write the emulation to conform to that. I would think AHCI would be a great way to go, if that's the case, since it would be an actual implementation of a standard instead of an arbitrary card. Does kvm use the qemu source for its drivers, or is there a separate source tree for it now? We use the qemu source (we have our own branch, but it's very close to qemu upstream). -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: AHCI?
I see. That sucks. Yeah... Is there any reason that kvm is hard-coded to only allow 4 IDE devices? Is it an issue with the BIOS implementation, or is it just a random limit? If there's some macro somewhere that defines NUM_IDE to 2, and I could change that to 5, then I'd be perfectly happy; I don't really care about SCSI vs. IDE, I just want to be able to use 9 drives with kvm instead of the 4 that I'm currently able to access. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: AHCI?
tsuraan wrote: I see. That sucks. Yeah... Is there any reason that kvm is hard-coded to only allow 4 IDE devices? Is it an issue with the BIOS implementation, or is it just a random limit? If there's some macro somewhere that defines NUM_IDE to 2, and I could change that to 5, then I'd be perfectly happy; I don't really care about SCSI vs. IDE, I just want to be able to use 9 drives with kvm instead of the 4 that I'm currently able to access. If you want that and good performance too write a virtio driver, and use virtio disk instead of IDE or SCSI. I don't think the IDE implementation in qemu supports more than four disks, that's an IDE limitation. No doubt it could be worked around by adding more IDE controllers, it's probably not a lot of code but quite tricky. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 00/17] virtual-bus
Anthony Liguori wrote: What we need is: 1) Lockless MMIO/PIO dispatch (there should be two IO registration interfaces, a new lockless one and the legacy one) Not sure exactly how much this is needed, since when there is no contention, locks are almost free (there's the atomic and cacheline bounce, but no syscall). For any long operations, we should drop the lock (of course we need some kind of read/write lock or rcu to avoid hotunplug or reconfiguration). 2) A virtio-net thread that's independent of the IO thread. Yes -- that saves us all the select() prologue (calculating new timeout) and the select() itself. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] extboot: silence compiler warning
Jan Kiszka wrote: Applied, thanks. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: AHCI?
If you want that and good performance too, write a virtio driver, and use virtio disk instead of IDE or SCSI. Is there a place where the interface types are documented? The man page lists ide, scsi, sd, mtd, floppy, pflash, and virtio. There's a virtio page on the kvm wiki, and I think I can guess what ide, scsi, and floppy are, but where are the rest of them documented? I'm going through the list right now looking for a combination that will let Solaris see all my disks, but a more informed approach might be more productive. I don't think the IDE implementation in qemu supports more than four disks, that's an IDE limitation. No doubt it could be worked around by adding more IDE controllers, it's probably not a lot of code but quite tricky. So it's down to hacking the VM code or writing a Solaris driver. Fun :) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 00/17] virtual-bus
Avi Kivity wrote: Anthony Liguori wrote: What we need is: 1) Lockless MMIO/PIO dispatch (there should be two IO registration interfaces, a new lockless one and the legacy one) Not sure exactly how much this is needed, since when there is no contention, locks are almost free (there's the atomic and cacheline bounce, but no syscall). There should be no contention but I strongly suspect there is more often than we think. The IO thread can potentially hold the lock for a very long period of time. Take into consideration things like qcow2 metadata read/write, VNC server updates, etc.. For any long operations, we should drop the lock (of course we need some kind of read/write lock or rcu to avoid hotunplug or reconfiguration). 2) A virtio-net thread that's independent of the IO thread. Yes -- that saves us all the select() prologue (calculating new timeout) and the select() itself. In an ideal world, we could do the submission via io_submit in the VCPU context, not worry about the copy latency (because we're zero copy). Then our packet transmission latency is consistently low because the path is consistent and lockless. This is why dropping the lock is so important, it's not enough to usually have low latency. We need to try and have latency as low as possible as often as possible. Regards, Anthony Liguori -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: IOMMU setting
Yes, it is an AMD system; however, the KVM website does mention that AMD IOMMU supports KVM device passthrough. I want to make sure my system has this capability. Regards Eric Date: Sat, 4 Apr 2009 21:18:23 +0300 From: m...@il.ibm.com To: ericliu2...@hotmail.com CC: kvm@vger.kernel.org Subject: Re: IOMMU setting On Sat, Apr 04, 2009 at 12:16:50AM +, Eric Liu wrote: Is there a quick way to check if system has IOMMU enabled in Linux? I saw the following messages in /var/log/messages: Apr 3 21:03:16 kernel: PCI-DMA: Disabling AGP. Apr 3 21:03:16 kernel: PCI-DMA: aperture base @ f400 size 65536 KB Apr 3 21:03:16 kernel: init_memory_mapping: f400-f800 Apr 3 21:03:16 kernel: last_map_addr: f800 end: f800 Apr 3 21:03:16 kernel: PCI-DMA: using GART IOMMU. Apr 3 21:03:16 kernel: PCI-DMA: Reserving 64MB of IOMMU area in the AGP aperture Does this mean IOMMU is enabled? And I don't need anything like iommu=force in boot option, right? It means that you are running on an AMD system, and that this system has a GART. You need an isolation-capable IOMMU such as Intel's VT-d for KVM in-tree device passthrough. Cheers, Muli -- Muli Ben-Yehuda | m...@il.ibm.com | +972-4-8281080 Manager, Virtualization and Systems Architecture Master Inventor, IBM Haifa Research Laboratory SYSTOR 2009---The Israeli Experimental Systems Conference http://www.haifa.il.ibm.com/conferences/systor2009/ _ Rediscover Hotmail®: Get quick friend updates right in your inbox. http://windowslive.com/RediscoverHotmail?ocid=TXT_TAGLM_WL_HM_Rediscover_Updates1_042009 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: IOMMU setting
On Sun, Apr 05, 2009 at 05:14:44PM +, Eric Liu wrote: Yes it is AMD system, however KVM website does mention AMD iommu support KVM device passthrough. I want to make sure my system has this capability. That's 'AMD IOMMU', as opposed to AMD GART, which is not an isolation-capable IOMMU. I don't know if any chipsets with AMD's IOMMU have been released yet. Cheers, Muli -- Muli Ben-Yehuda | m...@il.ibm.com | +972-4-8281080 Manager, Virtualization and Systems Architecture Master Inventor, IBM Haifa Research Laboratory SYSTOR 2009---The Israeli Experimental Systems Conference http://www.haifa.il.ibm.com/conferences/systor2009/ -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 0/2] disable unsync global page optimization
-- -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 1/2] KVM: MMU: disable global page optimization
The complexity required to fix the global page optimization is not worth the gains, as discussed in http://article.gmane.org/gmane.comp.emulators.kvm.devel/28649. Signed-off-by: Marcelo Tosatti mtosa...@redhat.com Index: kvm/arch/x86/kvm/mmu.c === --- kvm.orig/arch/x86/kvm/mmu.c +++ kvm/arch/x86/kvm/mmu.c @@ -1254,7 +1254,7 @@ static struct kvm_mmu_page *kvm_mmu_get_ pgprintk(%s: adding gfn %lx role %x\n, __func__, gfn, role.word); sp-gfn = gfn; sp-role = role; - sp-global = role.cr4_pge; + sp-global = 0; hlist_add_head(sp-hash_link, bucket); if (!direct) { if (rmap_write_protect(vcpu-kvm, gfn)) -- -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 2/2] KVM: MMU: remove global page optimization logic
Complexity to fix it not worthwhile the gains, as discussed in http://article.gmane.org/gmane.comp.emulators.kvm.devel/28649. Signed-off-by: Marcelo Tosatti mtosa...@redhat.com Index: kvm/arch/x86/include/asm/kvm_host.h === --- kvm.orig/arch/x86/include/asm/kvm_host.h +++ kvm/arch/x86/include/asm/kvm_host.h @@ -213,7 +213,6 @@ struct kvm_mmu_page { int multimapped; /* More than one parent_pte? */ int root_count; /* Currently serving as active root */ bool unsync; - bool global; unsigned int unsync_children; union { u64 *parent_pte; /* !multimapped */ @@ -395,7 +394,6 @@ struct kvm_arch{ */ struct list_head active_mmu_pages; struct list_head assigned_dev_head; - struct list_head oos_global_pages; struct iommu_domain *iommu_domain; struct kvm_pic *vpic; struct kvm_ioapic *vioapic; @@ -425,7 +423,6 @@ struct kvm_vm_stat { u32 mmu_recycled; u32 mmu_cache_miss; u32 mmu_unsync; - u32 mmu_unsync_global; u32 remote_tlb_flush; u32 lpages; }; @@ -640,7 +637,6 @@ void __kvm_mmu_free_some_pages(struct kv int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); -void kvm_mmu_sync_global(struct kvm_vcpu *vcpu); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); Index: kvm/arch/x86/kvm/mmu.c === --- kvm.orig/arch/x86/kvm/mmu.c +++ kvm/arch/x86/kvm/mmu.c @@ -1080,18 +1080,10 @@ static struct kvm_mmu_page *kvm_mmu_look return NULL; } -static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - list_del(sp-oos_link); - --kvm-stat.mmu_unsync_global; -} - static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { WARN_ON(!sp-unsync); sp-unsync = 0; - if (sp-global) - kvm_unlink_unsync_global(kvm, sp); --kvm-stat.mmu_unsync; } @@ -1254,7 +1246,6 @@ static struct kvm_mmu_page *kvm_mmu_get_ pgprintk(%s: adding gfn %lx role %x\n, __func__, gfn, role.word); sp-gfn = gfn; sp-role = role; - sp-global = 0; hlist_add_head(sp-hash_link, bucket); if (!direct) { if 
(rmap_write_protect(vcpu-kvm, gfn)) @@ -1652,11 +1643,7 @@ static int kvm_unsync_page(struct kvm_vc ++vcpu-kvm-stat.mmu_unsync; sp-unsync = 1; - if (sp-global) { - list_add(sp-oos_link, vcpu-kvm-arch.oos_global_pages); - ++vcpu-kvm-stat.mmu_unsync_global; - } else - kvm_mmu_mark_parents_unsync(vcpu, sp); + kvm_mmu_mark_parents_unsync(vcpu, sp); mmu_convert_notrap(sp); return 0; @@ -1683,21 +1670,12 @@ static int mmu_need_write_protect(struct static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, unsigned pte_access, int user_fault, int write_fault, int dirty, int largepage, - int global, gfn_t gfn, pfn_t pfn, bool speculative, + gfn_t gfn, pfn_t pfn, bool speculative, bool can_unsync) { u64 spte; int ret = 0; u64 mt_mask = shadow_mt_mask; - struct kvm_mmu_page *sp = page_header(__pa(shadow_pte)); - - if (!global sp-global) { - sp-global = 0; - if (sp-unsync) { - kvm_unlink_unsync_global(vcpu-kvm, sp); - kvm_mmu_mark_parents_unsync(vcpu, sp); - } - } /* * We don't set the accessed bit, since we sometimes want to see @@ -1771,8 +1749,8 @@ set_pte: static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, unsigned pt_access, unsigned pte_access, int user_fault, int write_fault, int dirty, -int *ptwrite, int largepage, int global, -gfn_t gfn, pfn_t pfn, bool speculative) +int *ptwrite, int largepage, gfn_t gfn, +pfn_t pfn, bool speculative) { int was_rmapped = 0; int was_writeble = is_writeble_pte(*shadow_pte); @@ -1801,7 +1779,7 @@ static void mmu_set_spte(struct kvm_vcpu was_rmapped = 1; } if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, - dirty, largepage, global, gfn, pfn, speculative, true)) { + dirty, largepage, gfn, pfn, speculative, true)) { if (write_fault) *ptwrite = 1; kvm_x86_ops-tlb_flush(vcpu); @@ -1849,7 +1827,7 @@ static int __direct_map(struct kvm_vcpu || (largepage iterator.level == PT_DIRECTORY_LEVEL)) {
Re: [PATCH] Fix display breakage when resizing the screen (v2)
Avi Kivity wrote: When the vga resolution changes, a new display surface is not allocated immediately; instead that is deferred until the next update. However, if we're running without a display client attached, that won't happen and the next bitblt is likely to cause a segfault by overflowing the display surface. Fix by reallocating the display immediately when the resolution changes. Tested with (Windows|Linux) x (cirrus|std) x (curses|sdl). Changes from v1: - fix segfault when switching virtual consoles with curses Signed-off-by: Avi Kivity a...@redhat.com Applied. Thanks. -- Regards, Anthony Liguori -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH -tip 0/6 V4] tracing: kprobe-based event tracer
On Fri, 03 Apr 2009 09:52:09 -0400 Masami Hiramatsu mhira...@redhat.com wrote: Vegard Nossum wrote: 2009/4/3 Ingo Molnar mi...@elte.hu: * Avi Kivity a...@redhat.com wrote: Ingo Molnar wrote: kvm has three requirements not needed by kprobes: - it wants to execute instructions, not just decode them, including generating faults where appropriate - it is performance critical - it needs to support 16-bit, 32-bit, and 64-bit instructions simultaneously If an arch/x86/ decoder/emulator gives me these I'll gladly switch to it. x86_emulate.c is high on my list of most disliked code. Well, this has to be driven from the KVM side as the kprobes use will only be for decoding so if it's modified from the kprobes side the KVM-only functionality might regress. So ... we can do the library decoder for kprobes purposes, and someone versed in the KVM emulator can then combine the two. Problem is, anyone versed in the kvm emulator will want to run as far away from this work as possible. Are you suggesting that the KVM emulator should never have been merged in the first place? ;-) Anyway, we'll make sure the kprobes/library decoder is as clean as possible - so it ought to be hackable and extensible without the risk of permanent brain damage. Mmiotrace and kmemcheck have decoding smarts too, and I think the sw-breakpoint injection code of KGDB could use it as well - so there's broader utility in all this. (Sorry in advance for jumping in -- my post may be irrelevant) Thank you for clarifying your needs :-) For the record, kmemcheck requirements for an instruction decoder are these: For any instruction with memory operands, we need to know which are the operands (so for movl %eax, (%ebx) we need to combine the instruction with a struct pt_regs to get the actual address dereferenced, i.e. the contents of %ebx), and their sizes (for movzbl, the source operand is 8 bits, destination operand is 32 bits). For things like movsb, we need to be able to get both %esi and %edi. 
New decoder can give you the value of mod/rm(insn.modrm), operand size (insn.opnd_bytes), and immediate size (insn.immediate.nbytes) To get which register is used, you can decode modrm with MODRM_*() macros. mmiotrace additionally needs to know what the actual values read/written were, for instructions that read/write to memory (again, combined with a struct pt_regs). The decoder doesn't use any locks/shared memory, so you can use it in interrupt context, with pt_regs. Maybe this doesn't really say much, since this is what a generic instruction decoder would be able to do anyway. But kmemcheck and mmiotrace both have very special-purpose decoders. I don't really know what other decoders look like, but what I would wish for is this: Some macros for iterating the operands, where each operand has a type (e.g. input (for reads), output (for writes), target (for jumps), immediate address, immediate value, etc.), a size (in bits), and a way to evaluate the operand. So eval(op, regs) for op=%eax, it will return regs-eax; for op=4(%eax), it will return regs-eax + 4; for op=4 it will return 4, etc. Hmm, it's an interesting idea. I think operand classifying can be done by evaluating opcode and mod/rm. Both kmemcheck and mmiotrace could gain SMP support with instruction emulation, though it is strictly not necessary. In that case, though, we would not want to emulate fault handling, etc. (i.e. the fault should always be generated by the CPU itself). Not just emulation but address diversion, i.e. modifying the operation (not the text) before executing it. Mmiotrace could do something like this: 1. a blob calls ioremap 2. mmiotrace maps the MMIO area privately 3. the blob receives a dummy map from ioremap, that will generate page fault 4. the blob accesses the dummy map and raises a page fault 5. pf handler detects the dummy map 6. mmiotrace pf handler emulates the instruction and replaces the dummy address with the real MMIO address. 7. 
mmiotrace records the operation and the datum 8. go to step 4, or whatever This means mmiotrace would not have to fiddle with the page tables and page presence bits like it does now. As said, this would make mmiotrace SMP-proof, and also eliminate the die notifier (used for the instruction single stepping trap). IMO a big step from a hack to a tool. Getting rid of the custom instruction parser in mmiotrace would be a good step in itself. Avi Kivity noted, that the KVM emulator does almost everything. Does it allow also address diversion? I haven't looked at the KVM emulator since something like 2.6.25 or so, and I probably don't have time to work with it anyway, but I am very interested to hear how things evolve. Thanks. -- Pekka Paalanen http://www.iki.fi/pq/ -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo