[PATCH] viostor driver. switch to full-duplex mode.
repository: /home/vadimr/shares/kvm-guest-drivers-windows branch: master commit ed4b9ade27b56e9ee37461c2cf72e46d75633e9c Author: Vadim Rozenfeldvroze...@redhat.com Date: Wed Sep 23 11:28:48 2009 +0300 [PATCH] viostor driver. switch to full-duplex mode. Signed-off-by: Vadim Rozenfeldvroze...@redhat.com diff --git a/viostor/virtio_stor.c b/viostor/virtio_stor.c index e4acaa0..297949a 100644 --- a/viostor/virtio_stor.c +++ b/viostor/virtio_stor.c @@ -194,6 +194,7 @@ VirtIoFindAdapter( ConfigInfo-WmiDataProvider= FALSE; #ifdef USE_STORPORT ConfigInfo-MapBuffers = STOR_MAP_NON_READ_WRITE_BUFFERS; +ConfigInfo-SynchronizationModel = StorSynchronizeFullDuplex; #else ConfigInfo-MapBuffers = TRUE; #endif @@ -323,6 +324,8 @@ VirtIoFindAdapter( return SP_RETURN_ERROR; } +InitializeListHead(adaptExt-list_head); + return SP_RETURN_FOUND; } @@ -488,7 +491,7 @@ VirtIoStartIo( case SCSIOP_WRITE: { Srb-SrbStatus = SRB_STATUS_PENDING; if(!RhelDoReadWrite(DeviceExtension, Srb)) { -Srb-SrbStatus = SRB_STATUS_ABORTED; +Srb-SrbStatus = SRB_STATUS_BUSY; CompleteSRB(DeviceExtension, Srb); } return TRUE; @@ -561,7 +564,7 @@ VirtIoInterrupt( Srb-SrbStatus = SRB_STATUS_ERROR; break; } - + RemoveEntryList(vbr-list_entry); CompleteSRB(DeviceExtension, Srb); } } diff --git a/viostor/virtio_stor.h b/viostor/virtio_stor.h index 1c0dbb6..2d98738 100644 --- a/viostor/virtio_stor.h +++ b/viostor/virtio_stor.h @@ -78,12 +78,8 @@ typedef struct virtio_blk_outhdr { u64 sector; }blk_outhdr, *pblk_outhdr; -struct list_head { -struct list_head *next, *prev; -}; - typedef struct virtio_blk_req { -struct list_head list; +LIST_ENTRY list_entry; struct request *req; blk_outhdr out_hdr; u8 status; @@ -98,6 +94,7 @@ typedef struct _ADAPTER_EXTENSION { blk_configinfo; ULONG queue_depth; BOOLEAN dump_mode; +LIST_ENTRYlist_head; }ADAPTER_EXTENSION, *PADAPTER_EXTENSION; typedef struct _RHEL_SRB_EXTENSION { diff --git a/viostor/virtio_stor_hw_helper.c b/viostor/virtio_stor_hw_helper.c index 43bde5a..3c09259 100644 
--- a/viostor/virtio_stor_hw_helper.c +++ b/viostor/virtio_stor_hw_helper.c @@ -15,22 +15,33 @@ #include virtio_stor_hw_helper.h #ifdef USE_STORPORT -BOOLEAN -RhelDoReadWrite(PVOID DeviceExtension, -PSCSI_REQUEST_BLOCK Srb) +BOOLEAN +SynchronizedAccessRoutine( +IN PVOID DeviceExtension, +IN PVOID Context +) { PADAPTER_EXTENSION adaptExt = (PADAPTER_EXTENSION)DeviceExtension; +PSCSI_REQUEST_BLOCK Srb = (PSCSI_REQUEST_BLOCK) Context; PRHEL_SRB_EXTENSION srbExt = (PRHEL_SRB_EXTENSION)Srb-SrbExtension; - if (adaptExt-pci_vq_info.vq-vq_ops-add_buf(adaptExt-pci_vq_info.vq, -srbExt-vbr.sg[0], - srbExt-out, srbExt-in, -srbExt-vbr) == 0) { +srbExt-vbr.sg[0], + srbExt-out, srbExt-in, +srbExt-vbr) == 0){ +InsertTailList(adaptExt-list_head,srbExt-vbr.list_entry); adaptExt-pci_vq_info.vq-vq_ops-kick(adaptExt-pci_vq_info.vq); return TRUE; } +StorPortBusy(DeviceExtension, 10); return FALSE; } + +BOOLEAN +RhelDoReadWrite(PVOID DeviceExtension, +PSCSI_REQUEST_BLOCK Srb) +{ +return StorPortSynchronizeAccess(DeviceExtension,SynchronizedAccessRoutine, (PVOID)Srb); +} #else BOOLEAN RhelDoReadWrite(PVOID DeviceExtension, @@ -83,6 +94,8 @@ RhelDoReadWrite(PVOID DeviceExtension, srbExt-vbr.sg[0], srbExt-out, srbExt-in, srbExt-vbr) == 0) { +//FIXME +InsertTailList(adaptExt-list_head,srbExt-vbr.list_entry); adaptExt-pci_vq_info.vq-vq_ops-kick(adaptExt-pci_vq_info.vq); return TRUE; } -- To unsubscribe from this list: send the line unsubscribe kvm-commits in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/5] Don't call svm_complete_interrupts for nested guests
Am 22.09.2009 um 18:26 schrieb Joerg Roedel j...@8bytes.org: On Fri, Sep 18, 2009 at 03:00:29PM +0200, Alexander Graf wrote: SVM has some cleanup code, that tries to reinject interrupts and exceptions when the guest didn't manage to deal with them yet. It basically transfers them to KVM internal state. Unfortunately, the internal state is reserved for the L1 guest state, so we shouldn't try to go through that logic when running a nested guest. When doing something the host KVM can handle, let's just reinject the event into the L2 guest, because we didn't touch its state anyways. I don't really understandt what problem this patch addresses. There are situations where we have events to reinject into the l2 guest directly. But the generic reinjection code works fine for it. The only problematic thing with it is that it implicitly relies on exit_int_info not to be changed in the exit cycle (which would be worth a comment). It si Joerg Signed-off-by: Alexander Graf ag...@suse.de --- arch/x86/kvm/svm.c | 18 ++ 1 files changed, 14 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f12a669..61efd13 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2349,7 +2349,10 @@ static int handle_exit(struct kvm_vcpu *vcpu) trace_kvm_exit(exit_code, svm-vmcb-save.rip); if (is_nested(svm)) { +struct vmcb_control_area *control = svm-vmcb-control; int vmexit; +int type; +int vec; nsvm_printk(nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx \n, exit_code, svm-vmcb-control.exit_info_1, @@ -2362,9 +2365,18 @@ static int handle_exit(struct kvm_vcpu *vcpu) if (vmexit == NESTED_EXIT_DONE) return 1; -} -svm_complete_interrupts(svm); +type = control-exit_int_info SVM_EXITINTINFO_TYPE_MASK; +vec = control-exit_int_info SVM_EXITINTINFO_VEC_MASK; +if ((type == SVM_EXITINTINFO_TYPE_INTR) || +((type == SVM_EXITINTINFO_TYPE_EXEPT) ! 
kvm_exception_is_soft(vec))) { +control-event_inj = control-exit_int_info; +control-event_inj_err = control-exit_int_info_err; +} +} else { +/* Don't interpret exit_info for nested guests */ +svm_complete_interrupts(svm); +} if (npt_enabled) { int mmu_reload = 0; @@ -2602,8 +2614,6 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) case SVM_EXITINTINFO_TYPE_EXEPT: /* In case of software exception do not reinject an exception vector, but re-execute and instruction instead */ -if (is_nested(svm)) -break; if (kvm_exception_is_soft(vector)) break; if (exitintinfo SVM_EXITINTINFO_VALID_ERR) { -- 1.6.0.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/5] Don't call svm_complete_interrupts for nested guests
Am 22.09.2009 um 18:26 schrieb Joerg Roedel j...@8bytes.org: On Fri, Sep 18, 2009 at 03:00:29PM +0200, Alexander Graf wrote: SVM has some cleanup code, that tries to reinject interrupts and exceptions when the guest didn't manage to deal with them yet. It basically transfers them to KVM internal state. Unfortunately, the internal state is reserved for the L1 guest state, so we shouldn't try to go through that logic when running a nested guest. When doing something the host KVM can handle, let's just reinject the event into the L2 guest, because we didn't touch its state anyways. I don't really understandt what problem this patch addresses. There are situations where we have events to reinject into the l2 guest directly. But the generic reinjection code works fine for it. The only problematic thing with it is that it implicitly relies on exit_int_info not to be changed in the exit cycle (which would be worth a comment). It simply tries to be too clever. Reevaluating exceptions won't work for example. Alex Joerg Signed-off-by: Alexander Graf ag...@suse.de --- arch/x86/kvm/svm.c | 18 ++ 1 files changed, 14 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f12a669..61efd13 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2349,7 +2349,10 @@ static int handle_exit(struct kvm_vcpu *vcpu) trace_kvm_exit(exit_code, svm-vmcb-save.rip); if (is_nested(svm)) { +struct vmcb_control_area *control = svm-vmcb-control; int vmexit; +int type; +int vec; nsvm_printk(nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx \n, exit_code, svm-vmcb-control.exit_info_1, @@ -2362,9 +2365,18 @@ static int handle_exit(struct kvm_vcpu *vcpu) if (vmexit == NESTED_EXIT_DONE) return 1; -} -svm_complete_interrupts(svm); +type = control-exit_int_info SVM_EXITINTINFO_TYPE_MASK; +vec = control-exit_int_info SVM_EXITINTINFO_VEC_MASK; +if ((type == SVM_EXITINTINFO_TYPE_INTR) || +((type == SVM_EXITINTINFO_TYPE_EXEPT) ! 
kvm_exception_is_soft(vec))) { +control-event_inj = control-exit_int_info; +control-event_inj_err = control-exit_int_info_err; +} +} else { +/* Don't interpret exit_info for nested guests */ +svm_complete_interrupts(svm); +} if (npt_enabled) { int mmu_reload = 0; @@ -2602,8 +2614,6 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) case SVM_EXITINTINFO_TYPE_EXEPT: /* In case of software exception do not reinject an exception vector, but re-execute and instruction instead */ -if (is_nested(svm)) -break; if (kvm_exception_is_soft(vector)) break; if (exitintinfo SVM_EXITINTINFO_VALID_ERR) { -- 1.6.0.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/5] Don't #VMEXIT(INTR) if we still have event_inj waiting
Am 22.09.2009 um 18:39 schrieb Joerg Roedel j...@8bytes.org: On Fri, Sep 18, 2009 at 03:00:30PM +0200, Alexander Graf wrote: Real hardware would first process the event_inj field and then notify the host that an interrupt is waiting. Does it really? I couldn't find this in the SVM spec. I didn't see it in the spec either, but that's what I saw real hardware do which is supported by the exit_info validity check in svm.c Let's do the same and just not EXIT_INTR if we have an event pending for the L2 guest. Anyway. I think this case is handled good enough with patch 5/5. This patch, to be complete must also enable single-stepping to exit again after the first instruction of the exception handler has run to inject the interrupt. But that would make the whole thing rather complicated. The NMI injection path has logic for that, but for now it shouldn't hurt - we'll get an interrupt sooner or later. Signed-off-by: Alexander Graf ag...@suse.de --- arch/x86/kvm/svm.c |4 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 61efd13..28fcbd0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1401,6 +1401,10 @@ static inline int nested_svm_intr(struct vcpu_svm *svm) if (!(svm-vcpu.arch.hflags HF_HIF_MASK)) return 0; +/* We can't EXIT_INTR when we still have an event to inject */ +if (svm-vmcb-control.event_inj) +return 1; + svm-vmcb-control.exit_code = SVM_EXIT_INTR; if (nested_svm_exit_handled(svm)) { -- 1.6.0.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/5] Don't call svm_complete_interrupts for nested guests
On Wed, Sep 23, 2009 at 01:05:57AM -0700, Alexander Graf wrote: Am 22.09.2009 um 18:26 schrieb Joerg Roedel j...@8bytes.org: On Fri, Sep 18, 2009 at 03:00:29PM +0200, Alexander Graf wrote: SVM has some cleanup code, that tries to reinject interrupts and exceptions when the guest didn't manage to deal with them yet. It basically transfers them to KVM internal state. Unfortunately, the internal state is reserved for the L1 guest state, so we shouldn't try to go through that logic when running a nested guest. When doing something the host KVM can handle, let's just reinject the event into the L2 guest, because we didn't touch its state anyways. I don't really understandt what problem this patch addresses. There are situations where we have events to reinject into the l2 guest directly. But the generic reinjection code works fine for it. The only problematic thing with it is that it implicitly relies on exit_int_info not to be changed in the exit cycle (which would be worth a comment). It simply tries to be too clever. Reevaluating exceptions won't work for example. Can you elaborate? What do you mean by too clever and why reevaluating exceptions won't work? -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Update halted state from mpstate only in case of inkernel irq chip
On 09/22/2009 05:51 PM, Gleb Natapov wrote: Otherwise cpu is always unhalted by call to kvm_arch_get_registers() Applied, thanks. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Don't call kvm_cpu_synchronize_state() if there is no irqchip events to process.
On 09/22/2009 05:51 PM, Gleb Natapov wrote: Signed-off-by: Gleb Natapovg...@redhat.com Applied, thanks. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Don't call cpu_synchronize_state() in apic_init_reset()
On 09/22/2009 05:51 PM, Gleb Natapov wrote: Each caller of the function already calls it. Signed-off-by: Gleb Natapovg...@redhat.com --- hw/apic.c |1 - 1 files changed, 0 insertions(+), 1 deletions(-) diff --git a/hw/apic.c b/hw/apic.c index 3a2e128..a9d1fb8 100644 --- a/hw/apic.c +++ b/hw/apic.c @@ -488,7 +488,6 @@ void apic_init_reset(CPUState *env) if (!s) return; -cpu_synchronize_state(env); s-tpr = 0; s-spurious_vec = 0xff; s-log_dest = 0; Still, it's safer to leave this in. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] qemu-kvm: fix no gsi routing warning
On 09/21/2009 10:50 PM, Michael S. Tsirkin wrote: On Mon, Sep 21, 2009 at 05:56:13PM +0300, Avi Kivity wrote: On 09/21/2009 05:51 PM, Michael S. Tsirkin wrote: When running on host kernel which does not let the guest manupulate the gsi routing, and user requested MSI-X to be enabled, we get the following warnings: kvm_msix_add: kvm_get_irq_route_gsi failed: No space left on device kvm_msix_update: kvm_update_routing_entry failed: Invalid argument What really happens is that we report a failure to allocate a vector to the guest, it will retry and finally disable MSI. Make this clearer by checking for gsi capability and warning about the error in a readable form. Can we disable msix What we do effectively disables msix. (or, abort qemu at startup and request that the user disable msix) if the kernel doesn't provide required features? It's better than a runtime error. Note it's a warning, not an error. We have a similar failure mode when we run out of gsi entries, and that can not be checked upfront. I prefer having it fail in the same way, so that this failure path in guest drivers is excercised and tested. I see, applied the patch, thanks. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Possible problem with -no-kvm-irqchip -no-kvm-pit qemu options detected during git daily testing
On 09/21/2009 10:24 PM, Lucas Meneghel Rodrigues wrote: Ok, tests ended, indeed a problem showed up - we still have the problem I reported with the options -no-kvm-irqchip -no-kvm-pit *when* are coupled with the option -smp 2. If -smp 2 is not present, qemu works fine with the options -no-kvm-irqchip -no-kvm-pit. An example of command line: /usr/local/autotest/tests/kvm/qemu -name 'vm1' -monitor unix:/tmp/monitor-20090918-131514-TdDQ,server,nowait -drive file=/usr/local/autotest/tests/kvm/images/fc9-64.qcow2,if=ide,boot=on -net nic,vlan=0 -net user,vlan=0 -m 512 -smp 2 -no-kvm-irqchip -no-kvm-pit -redir tcp:5000::22 -vnc :0 It's necessary to look further what could be causing this problem. I've applied yet another patch by Gleb that should fix this. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
buildbot failure in qemu-kvm on disable_kvm_x86_64_out_of_tree
The Buildbot has detected a new failure of disable_kvm_x86_64_out_of_tree on qemu-kvm. Full details are available at: http://buildbot.b1-systems.de/qemu-kvm/builders/disable_kvm_x86_64_out_of_tree/builds/19 Buildbot URL: http://buildbot.b1-systems.de/qemu-kvm/ Buildslave for this Build: b1_qemu_kvm_1 Build Reason: Build Source Stamp: [branch next] HEAD Blamelist: Avi Kivity a...@redhat.com,Gleb Natapov g...@redhat.com,Michael S. Tsirkin m...@redhat.com BUILD FAILED: failed git sincerely, -The Buildbot -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
buildbot failure in qemu-kvm on disable_kvm_x86_64_debian_5_0
The Buildbot has detected a new failure of disable_kvm_x86_64_debian_5_0 on qemu-kvm. Full details are available at: http://buildbot.b1-systems.de/qemu-kvm/builders/disable_kvm_x86_64_debian_5_0/builds/70 Buildbot URL: http://buildbot.b1-systems.de/qemu-kvm/ Buildslave for this Build: b1_qemu_kvm_1 Build Reason: Build Source Stamp: [branch next] HEAD Blamelist: Avi Kivity a...@redhat.com,Gleb Natapov g...@redhat.com,Michael S. Tsirkin m...@redhat.com BUILD FAILED: failed git sincerely, -The Buildbot -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
buildbot failure in qemu-kvm on disable_kvm_i386_debian_5_0
The Buildbot has detected a new failure of disable_kvm_i386_debian_5_0 on qemu-kvm. Full details are available at: http://buildbot.b1-systems.de/qemu-kvm/builders/disable_kvm_i386_debian_5_0/builds/71 Buildbot URL: http://buildbot.b1-systems.de/qemu-kvm/ Buildslave for this Build: b1_qemu_kvm_2 Build Reason: Build Source Stamp: [branch next] HEAD Blamelist: Avi Kivity a...@redhat.com,Gleb Natapov g...@redhat.com,Michael S. Tsirkin m...@redhat.com BUILD FAILED: failed git sincerely, -The Buildbot -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: Virtio net driver for Windows 2008 R2
Now :) You can compile the drivers from the git://git.kernel.org/pub/scm/virt/kvm/kvm-guest-drivers-windows.git Use Vista drivers for Win2008R2 if you use default build scripts. You can also use latest WDK and compile the code for Windows 2008 R2. Please keep in mind that you need to test sign the drivers for 64bit MS OSes. Kernel.org policies are preventing me from keeping test certificate in repository and signing the drivers automatically during build process. http://msdn.microsoft.com/en-us/library/aa906344.aspx Best regards, Yan. -Original Message- From: kvm-ow...@vger.kernel.org [mailto:kvm-ow...@vger.kernel.org] On Behalf Of carlopmart Sent: Monday, September 21, 2009 11:35 PM To: kvm@vger.kernel.org Subject: Virtio net driver for Windows 2008 R2 Hi all. When will be possible to test virtio net drivers for Windows 2008 R2 kvm guests? Thanks. -- CL Martinez carlopmart {at} gmail {d0t} com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Test 802.1Q vlan of nic
Test 802.1Q vlan of nic, config it by vconfig command. 1) Create two VMs 2) Setup guests in different vlan by vconfig and test communication by ping using hard-coded ip address 3) Setup guests in same vlan and test communication by ping 4) Recover the vlan config Signed-off-by: Amos Kong ak...@redhat.com --- client/tests/kvm/kvm_tests.cfg.sample |6 +++ client/tests/kvm/tests/vlan_tag.py| 66 + 2 files changed, 72 insertions(+), 0 deletions(-) create mode 100644 client/tests/kvm/tests/vlan_tag.py diff --git a/client/tests/kvm/kvm_tests.cfg.sample b/client/tests/kvm/kvm_tests.cfg.sample index 285a38f..5a3f97d 100644 --- a/client/tests/kvm/kvm_tests.cfg.sample +++ b/client/tests/kvm/kvm_tests.cfg.sample @@ -145,6 +145,12 @@ variants: kill_vm = yes kill_vm_gracefully = no +- vlan_tag: install setup +type = vlan_tag +subnet2 = 192.168.123 +vlans = 10 20 +nic_mode = tap +nic_model = e1000 # NICs variants: diff --git a/client/tests/kvm/tests/vlan_tag.py b/client/tests/kvm/tests/vlan_tag.py new file mode 100644 index 000..2904276 --- /dev/null +++ b/client/tests/kvm/tests/vlan_tag.py @@ -0,0 +1,66 @@ +import logging, time +from autotest_lib.client.common_lib import error +import kvm_subprocess, kvm_test_utils, kvm_utils + +def run_vlan_tag(test, params, env): + +Test 802.1Q vlan of nic, config it by vconfig command. + +1) Create two VMs +2) Setup guests in different vlan by vconfig and test communication by ping + using hard-coded ip address +3) Setup guests in same vlan and test communication by ping +4) Recover the vlan config + +@param test: Kvm test object +@param params: Dictionary with the test parameters. +@param env: Dictionary with test environment. 
+ + +vm = [] +session = [] +subnet2 = params.get(subnet2) +vlans = params.get(vlans).split() + +vm.append(kvm_test_utils.get_living_vm(env, %s % params.get(main_vm))) + +params_vm2 = params.copy() +params_vm2['image_snapshot'] = yes +params_vm2['kill_vm_gracefully'] = no +params_vm2[address_index] = int(params.get(address_index, 0))+1 +vm.append(vm[0].clone(vm2, params_vm2)) +kvm_utils.env_register_vm(env, vm2, vm[1]) +if not vm[1].create(): +raise error.TestError, VM 'vm[1]' create faild + +for i in range(2): +session.append(kvm_test_utils.wait_for_login(vm[i])) + +try: +vconfig_cmd = vconfig add eth0 %s;ifconfig eth0.%s %s.%s +if session[0].get_command_status(vconfig_cmd % (vlans[0], +vlans[0], +subnet2, +11)) != 0 or \ + session[1].get_command_status(vconfig_cmd % (vlans[1], +vlans[1], +subnet2, +12)) != 0: +raise error.TestError, Fail to config VMs ip address +if session[0].get_command_status(ping -c 2 %s.12 % subnet2) == 0: +raise error.TestFail(Guest is unexpectedly pingable in different + vlan) + +if session[1].get_command_status(vconfig rem eth0.%s;vconfig add eth0 + %s;ifconfig eth0.%s %s.12 % + (vlans[1], + vlans[0], + vlans[0], + subnet2)) != 0: +raise error.TestError, Fail to config ip address of VM 'vm[1]' +if session[0].get_command_status(ping -c 2 %s.12 % subnet2) != 0: +raise error.TestFail, Fail to ping the guest in same vlan +finally: +for i in range(2): +session[i].sendline(vconfig rem eth0.%s % vlans[0]) +session[i].close() -- 1.5.5.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] [RESEND] KVM:VMX: Add support for Pause-Loop Exiting
Avi, This is the patch to enable PLE, which depends on a small change to the Linux scheduler (see http://lkml.org/lkml/2009/5/20/447). According to our discussion last time, one missing part is that on a PLE exit, we should pick up an unscheduled vcpu at random and schedule it. But further investigation found that: 1. It is hard for KVM to know the schedule state of each vcpu. 2. The Linux scheduler has no existing API that can be used to pull a specific task to this cpu, so we would need more changes to the common scheduler. So I prefer the current simple way: just give up the current cpu time. If there is no objection, I'll try to push the common scheduler change to Linux first. Thanks, edwin kvm_ple_v2.patch Description: Binary data
Re: [PATCH] [RESEND] KVM:VMX: Add support for Pause-Loop Exiting
On 09/23/2009 05:04 PM, Zhai, Edwin wrote: Avi, This is the patch to enable PLE, which depends on the a small change of Linux scheduler (see http://lkml.org/lkml/2009/5/20/447). According to our discussion last time, one missing part is that if PLE exit, pick up an unscheduled vcpu at random and schedule it. But further investigation found that: 1. KVM is hard to know the schedule state for each vcpu. 2. Linux scheduler has no existed API can be used to pull a specific task to this cpu, so we need more changes to the common scheduler. So I prefer current simple way: just give up current cpu time. If no objection, I'll try to push common scheduler change first to linux. We haven't sorted out what is the correct thing to do here. I think we should go for a directed yield, but until we have it, you can use hrtimers to sleep for 100 microseconds and hope the holding vcpu will get scheduled. Even if it doesn't, we're only wasting a few percent cpu time instead of spinning. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server
Avi Kivity wrote: On 09/22/2009 12:43 AM, Ira W. Snyder wrote: Sure, virtio-ira and he is on his own to make a bus-model under that, or virtio-vbus + vbus-ira-connector to use the vbus framework. Either model can work, I agree. Yes, I'm having to create my own bus model, a-la lguest, virtio-pci, and virtio-s390. It isn't especially easy. I can steal lots of code from the lguest bus model, but sometimes it is good to generalize, especially after the fourth implemention or so. I think this is what GHaskins tried to do. Yes. vbus is more finely layered so there is less code duplication. To clarify, Ira was correct in stating this generalizing some of these components was one of the goals for the vbus project: IOW vbus finely layers and defines what's below virtio, not replaces it. You can think of a virtio-stack like this: -- | virtio-net -- | virtio-ring -- | virtio-bus -- | ? undefined ? -- IOW: The way I see it, virtio is a device interface model only. The rest of it is filled in by the virtio-transport and some kind of back-end. So today, we can complete the ? undefined ? block like this for KVM: -- | virtio-pci -- | -- | kvm.ko -- | qemu -- | tuntap -- In this case, kvm.ko and tuntap are providing plumbing, and qemu is providing a backend device model (pci-based, etc). You can, of course, plug a different stack in (such as virtio-lguest, virtio-ira, etc) but you are more or less on your own to recreate many of the various facilities contained in that stack (such as things provided by QEMU, like discovery/hotswap/addressing), as Ira is discovering. Vbus tries to commoditize more components in the stack (like the bus model and backend-device model) so they don't need to be redesigned each time we solve this virtio-transport problem. IOW: stop the proliferation of the need for pci-bus, lguest-bus, foo-bus underneath virtio. Instead, we can then focus on the value add on top, like the models themselves or the simple glue between them. 
So now you might have something like -- | virtio-vbus -- | vbus-proxy -- | kvm-guest-connector -- | -- | kvm.ko -- | kvm-host-connector.ko -- | vbus.ko -- | virtio-net-backend.ko -- so now we don't need to worry about the bus-model or the device-model framework. We only need to implement the connector, etc. This is handy when you find yourself in an environment that doesn't support PCI (such as Ira's rig, or userspace containers), or when you want to add features that PCI doesn't have (such as fluid event channels for things like IPC services, or priortizable interrupts, etc). The virtio layering was more or less dictated by Xen which doesn't have shared memory (it uses grant references instead). As a matter of fact lguest, kvm/pci, and kvm/s390 all have shared memory, as you do, so that part is duplicated. It's probably possible to add a virtio-shmem.ko library that people who do have shared memory can reuse. Note that I do not believe the Xen folk use virtio, so while I can appreciate the foresight that went into that particular aspect of the design of the virtio model, I am not sure if its a realistic constraint. The reason why I decided to not worry about that particular model is twofold: 1) Trying to support non shared-memory designs is prohibitively high for my performance goals (for instance, requiring an exit on each -add_buf() in addition to the -kick()). 2) The Xen guys are unlikely to diverge from something like xenbus/xennet anyway, so it would be for naught. Therefore, I just went with a device model optimized for shared-memory outright. That said, I believe we can refactor what is called the vbus-proxy-device into this virtio-shmem interface that you and Anthony have described. We could make the feature optional and only support on architectures where this makes sense. snip Kind Regards, -Greg signature.asc Description: OpenPGP digital signature
Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server
On 09/23/2009 05:26 PM, Gregory Haskins wrote: Yes, I'm having to create my own bus model, a-la lguest, virtio-pci, and virtio-s390. It isn't especially easy. I can steal lots of code from the lguest bus model, but sometimes it is good to generalize, especially after the fourth implemention or so. I think this is what GHaskins tried to do. Yes. vbus is more finely layered so there is less code duplication. To clarify, Ira was correct in stating this generalizing some of these components was one of the goals for the vbus project: IOW vbus finely layers and defines what's below virtio, not replaces it. You can think of a virtio-stack like this: -- | virtio-net -- | virtio-ring -- | virtio-bus -- | ? undefined ? -- IOW: The way I see it, virtio is a device interface model only. The rest of it is filled in by the virtio-transport and some kind of back-end. So today, we can complete the ? undefined ? block like this for KVM: -- | virtio-pci -- | -- | kvm.ko -- | qemu -- | tuntap -- In this case, kvm.ko and tuntap are providing plumbing, and qemu is providing a backend device model (pci-based, etc). You can, of course, plug a different stack in (such as virtio-lguest, virtio-ira, etc) but you are more or less on your own to recreate many of the various facilities contained in that stack (such as things provided by QEMU, like discovery/hotswap/addressing), as Ira is discovering. Vbus tries to commoditize more components in the stack (like the bus model and backend-device model) so they don't need to be redesigned each time we solve this virtio-transport problem. IOW: stop the proliferation of the need for pci-bus, lguest-bus, foo-bus underneath virtio. Instead, we can then focus on the value add on top, like the models themselves or the simple glue between them. 
So now you might have something like -- | virtio-vbus -- | vbus-proxy -- | kvm-guest-connector -- | -- | kvm.ko -- | kvm-host-connector.ko -- | vbus.ko -- | virtio-net-backend.ko -- so now we don't need to worry about the bus-model or the device-model framework. We only need to implement the connector, etc. This is handy when you find yourself in an environment that doesn't support PCI (such as Ira's rig, or userspace containers), or when you want to add features that PCI doesn't have (such as fluid event channels for things like IPC services, or priortizable interrupts, etc). Well, vbus does more, for example it tunnels interrupts instead of exposing them 1:1 on the native interface if it exists. It also pulls parts of the device model into the host kernel. The virtio layering was more or less dictated by Xen which doesn't have shared memory (it uses grant references instead). As a matter of fact lguest, kvm/pci, and kvm/s390 all have shared memory, as you do, so that part is duplicated. It's probably possible to add a virtio-shmem.ko library that people who do have shared memory can reuse. Note that I do not believe the Xen folk use virtio, so while I can appreciate the foresight that went into that particular aspect of the design of the virtio model, I am not sure if its a realistic constraint. Since a virtio goal was to reduce virtual device driver proliferation, it was necessary to accommodate Xen. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server
Avi Kivity wrote: On 09/23/2009 05:26 PM, Gregory Haskins wrote: Yes, I'm having to create my own bus model, a-la lguest, virtio-pci, and virtio-s390. It isn't especially easy. I can steal lots of code from the lguest bus model, but sometimes it is good to generalize, especially after the fourth implemention or so. I think this is what GHaskins tried to do. Yes. vbus is more finely layered so there is less code duplication. To clarify, Ira was correct in stating this generalizing some of these components was one of the goals for the vbus project: IOW vbus finely layers and defines what's below virtio, not replaces it. You can think of a virtio-stack like this: -- | virtio-net -- | virtio-ring -- | virtio-bus -- | ? undefined ? -- IOW: The way I see it, virtio is a device interface model only. The rest of it is filled in by the virtio-transport and some kind of back-end. So today, we can complete the ? undefined ? block like this for KVM: -- | virtio-pci -- | -- | kvm.ko -- | qemu -- | tuntap -- In this case, kvm.ko and tuntap are providing plumbing, and qemu is providing a backend device model (pci-based, etc). You can, of course, plug a different stack in (such as virtio-lguest, virtio-ira, etc) but you are more or less on your own to recreate many of the various facilities contained in that stack (such as things provided by QEMU, like discovery/hotswap/addressing), as Ira is discovering. Vbus tries to commoditize more components in the stack (like the bus model and backend-device model) so they don't need to be redesigned each time we solve this virtio-transport problem. IOW: stop the proliferation of the need for pci-bus, lguest-bus, foo-bus underneath virtio. Instead, we can then focus on the value add on top, like the models themselves or the simple glue between them. 
So now you might have something like -- | virtio-vbus -- | vbus-proxy -- | kvm-guest-connector -- | -- | kvm.ko -- | kvm-host-connector.ko -- | vbus.ko -- | virtio-net-backend.ko -- so now we don't need to worry about the bus-model or the device-model framework. We only need to implement the connector, etc. This is handy when you find yourself in an environment that doesn't support PCI (such as Ira's rig, or userspace containers), or when you want to add features that PCI doesn't have (such as fluid event channels for things like IPC services, or priortizable interrupts, etc). Well, vbus does more, for example it tunnels interrupts instead of exposing them 1:1 on the native interface if it exists. As I've previously explained, that trait is a function of the kvm-connector I've chosen to implement, not of the overall design of vbus. The reason why my kvm-connector is designed that way is because my early testing/benchmarking shows one of the issues in KVM performance is the ratio of exits per IO operation are fairly high, especially as your scale io-load. Therefore, the connector achieves a substantial reduction in that ratio by treating interrupts to the same kind of benefits that NAPI brought to general networking: That is, we enqueue interrupt messages into a lockless ring and only hit the IDT for the first occurrence. Subsequent interrupts are injected in a parallel/lockless manner, without hitting the IDT nor incurring an extra EOI. This pays dividends as the IO rate increases, which is when the guest needs the most help. OTOH, it is entirely possible to design the connector such that we maintain a 1:1 ratio of signals to traditional IDT interrupts. It is also possible to design a connector which surfaces as something else, such as PCI devices (by terminating the connector in QEMU and utilizing its PCI emulation facilities), which would naturally employ 1:1 mapping. So if 1:1 mapping is a critical feature (I would argue to the contrary), vbus can support it. 
It also pulls parts of the device model into the host kernel. That is the point. Most of it needs to be there for performance. And what doesn't need to be there for performance can either be: a) skipped at the discretion of the connector/device-model designer OR b) included because it's a trivially small subset of the model (e.g. a mac-addr attribute) and it's nice to have a cohesive solution instead of requiring a separate binary blob that can get out of sync, etc. The example I've provided to date (venet on kvm) utilizes (b), but it certainly
Re: [PATCH] Don't call cpu_synchronize_state() in apic_init_reset()
Avi Kivity wrote: On 09/22/2009 05:51 PM, Gleb Natapov wrote: Each caller of the function already calls it. Signed-off-by: Gleb Natapovg...@redhat.com --- hw/apic.c |1 - 1 files changed, 0 insertions(+), 1 deletions(-) diff --git a/hw/apic.c b/hw/apic.c index 3a2e128..a9d1fb8 100644 --- a/hw/apic.c +++ b/hw/apic.c @@ -488,7 +488,6 @@ void apic_init_reset(CPUState *env) if (!s) return; -cpu_synchronize_state(env); s-tpr = 0; s-spurious_vec = 0xff; s-log_dest = 0; Still, it's safer to leave this in. Yet another diff to upstream... It's really time to stabilize this still a bit fuzzy register sync model, also in qemu-kvm. If you think we need it, let us push it upstream - with a sound explanation, and maybe even with more sync points for the sake of consistency. Jan signature.asc Description: OpenPGP digital signature
Re: [PATCH] Don't call cpu_synchronize_state() in apic_init_reset()
On 09/23/2009 06:07 PM, Jan Kiszka wrote: Avi Kivity wrote: On 09/22/2009 05:51 PM, Gleb Natapov wrote: Each caller of the function already calls it. Signed-off-by: Gleb Natapovg...@redhat.com --- hw/apic.c |1 - 1 files changed, 0 insertions(+), 1 deletions(-) diff --git a/hw/apic.c b/hw/apic.c index 3a2e128..a9d1fb8 100644 --- a/hw/apic.c +++ b/hw/apic.c @@ -488,7 +488,6 @@ void apic_init_reset(CPUState *env) if (!s) return; -cpu_synchronize_state(env); s-tpr = 0; s-spurious_vec = 0xff; s-log_dest = 0; Still, it's safer to live this in. Yet another diff to upstream... It's really time to stabilize this still a bit fuzzy register sync model, also in qemu-kvm. If you think we need it, let us push it upstream - with a sound explanation, and maybe even with more sync points for the sake of consistency. Functions calling each other in the same subsystem can rely on callers calling cpu_synchronize_state(). Across subsystems, that's another matter, exported functions should try not to rely on implementation details of their callers. (You might argue that the apic is not separate subsystem wrt an x86 cpu, and I'm not sure I have a counterargument) -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Don't call cpu_synchronize_state() in apic_init_reset()
Avi Kivity wrote: On 09/23/2009 06:07 PM, Jan Kiszka wrote: Avi Kivity wrote: On 09/22/2009 05:51 PM, Gleb Natapov wrote: Each caller of the function already calls it. Signed-off-by: Gleb Natapovg...@redhat.com --- hw/apic.c |1 - 1 files changed, 0 insertions(+), 1 deletions(-) diff --git a/hw/apic.c b/hw/apic.c index 3a2e128..a9d1fb8 100644 --- a/hw/apic.c +++ b/hw/apic.c @@ -488,7 +488,6 @@ void apic_init_reset(CPUState *env) if (!s) return; -cpu_synchronize_state(env); s-tpr = 0; s-spurious_vec = 0xff; s-log_dest = 0; Still, it's safer to live this in. Yet another diff to upstream... It's really time to stabilize this still a bit fuzzy register sync model, also in qemu-kvm. If you think we need it, let us push it upstream - with a sound explanation, and maybe even with more sync points for the sake of consistency. Functions calling each other in the same subsystem can rely on callers calling cpu_synchronize_state(). Across subsystems, that's another matter, exported functions should try not to rely on implementation details of their callers. (You might argue that the apic is not separate subsystem wrt an x86 cpu, and I'm not sure I have a counterargument) I do accept this argument. It's just that my feeling is that we are lacking proper review of the required call sites of cpu_synchronize_state and rather put it where some regression popped up (and that only in qemu-kvm...). The new rule is: Synchronize the states before accessing registers (or in-kernel devices) the first time after a vmexit to user space. But, e.g., I do not see where we do this on CPU reset. Jan signature.asc Description: OpenPGP digital signature
sync guest calls made async on host - SQLite performance
Hi, I would like to call attention to the SQLite performance under KVM in the current Ubuntu Alpha. http://www.phoronix.com/scan.php?page=articleitem=linux_2631_kvmnum=3 SQLite's benchmark as part of the Phoronix Test Suite is typically IO limited and is affected by both disk and filesystem performance. When comparing SQLite under the host against the guest OS, there is an order of magnitude _IMPROVEMENT_ in the measured performance of the guest. I am expecting that the host is doing synchronous IO operations but somewhere in the stack the calls are ultimately being made asynchronous or at the very least batched for writing. On the surface, this represents a data integrity issue and I am interested in the KVM community's thoughts on this behaviour. Is it expected? Is it acceptable? Is it safe? Regards, Matthew -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server
Gregory Haskins wrote: Avi Kivity wrote: On 09/23/2009 05:26 PM, Gregory Haskins wrote: Yes, I'm having to create my own bus model, a-la lguest, virtio-pci, and virtio-s390. It isn't especially easy. I can steal lots of code from the lguest bus model, but sometimes it is good to generalize, especially after the fourth implemention or so. I think this is what GHaskins tried to do. Yes. vbus is more finely layered so there is less code duplication. To clarify, Ira was correct in stating this generalizing some of these components was one of the goals for the vbus project: IOW vbus finely layers and defines what's below virtio, not replaces it. You can think of a virtio-stack like this: -- | virtio-net -- | virtio-ring -- | virtio-bus -- | ? undefined ? -- IOW: The way I see it, virtio is a device interface model only. The rest of it is filled in by the virtio-transport and some kind of back-end. So today, we can complete the ? undefined ? block like this for KVM: -- | virtio-pci -- | -- | kvm.ko -- | qemu -- | tuntap -- In this case, kvm.ko and tuntap are providing plumbing, and qemu is providing a backend device model (pci-based, etc). You can, of course, plug a different stack in (such as virtio-lguest, virtio-ira, etc) but you are more or less on your own to recreate many of the various facilities contained in that stack (such as things provided by QEMU, like discovery/hotswap/addressing), as Ira is discovering. Vbus tries to commoditize more components in the stack (like the bus model and backend-device model) so they don't need to be redesigned each time we solve this virtio-transport problem. IOW: stop the proliferation of the need for pci-bus, lguest-bus, foo-bus underneath virtio. Instead, we can then focus on the value add on top, like the models themselves or the simple glue between them. 
So now you might have something like -- | virtio-vbus -- | vbus-proxy -- | kvm-guest-connector -- | -- | kvm.ko -- | kvm-host-connector.ko -- | vbus.ko -- | virtio-net-backend.ko -- so now we don't need to worry about the bus-model or the device-model framework. We only need to implement the connector, etc. This is handy when you find yourself in an environment that doesn't support PCI (such as Ira's rig, or userspace containers), or when you want to add features that PCI doesn't have (such as fluid event channels for things like IPC services, or priortizable interrupts, etc). Well, vbus does more, for example it tunnels interrupts instead of exposing them 1:1 on the native interface if it exists. As I've previously explained, that trait is a function of the kvm-connector I've chosen to implement, not of the overall design of vbus. The reason why my kvm-connector is designed that way is because my early testing/benchmarking shows one of the issues in KVM performance is the ratio of exits per IO operation are fairly high, especially as your scale io-load. Therefore, the connector achieves a substantial reduction in that ratio by treating interrupts to the same kind of benefits that NAPI brought to general networking: That is, we enqueue interrupt messages into a lockless ring and only hit the IDT for the first occurrence. Subsequent interrupts are injected in a parallel/lockless manner, without hitting the IDT nor incurring an extra EOI. This pays dividends as the IO rate increases, which is when the guest needs the most help. OTOH, it is entirely possible to design the connector such that we maintain a 1:1 ratio of signals to traditional IDT interrupts. It is also possible to design a connector which surfaces as something else, such as PCI devices (by terminating the connector in QEMU and utilizing its PCI emulation facilities), which would naturally employ 1:1 mapping. So if 1:1 mapping is a critical feature (I would argue to the contrary), vbus can support it. 
It also pulls parts of the device model into the host kernel. That is the point. Most of it needs to be there for performance. To clarify this point: There are various aspects about designing high-performance virtual devices such as providing the shortest paths possible between the physical resources and the consumers. Conversely, we also need to ensure that we meet proper isolation/protection guarantees at the same time. What this means is there are various aspects to any high-performance PV
[PATCH 0/3] ksm support for kvm v2
Hope I fixed everything I was asked... please tell me if I forgot anything. Izik Eidus (3): kvm: dont hold pagecount reference for mapped sptes pages add SPTE_HOST_WRITEABLE flag to the shadow ptes add support for change_pte mmu notifiers arch/x86/include/asm/kvm_host.h |1 + arch/x86/kvm/mmu.c | 86 ++ arch/x86/kvm/paging_tmpl.h | 18 +++- virt/kvm/kvm_main.c | 14 ++ 4 files changed, 98 insertions(+), 21 deletions(-) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/3] kvm: dont hold pagecount reference for mapped sptes pages
When using mmu notifiers, we are allowed to remove the page count reference tooken by get_user_pages to a specific page that is mapped inside the shadow page tables. This is needed so we can balance the pagecount against mapcount checking. (Right now kvm increase the pagecount and does not increase the mapcount when mapping page into shadow page table entry, so when comparing pagecount against mapcount, you have no reliable result.) Signed-off-by: Izik Eidus iei...@redhat.com --- arch/x86/kvm/mmu.c |7 ++- 1 files changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index eca41ae..6c67b23 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -634,9 +634,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) if (*spte shadow_accessed_mask) kvm_set_pfn_accessed(pfn); if (is_writeble_pte(*spte)) - kvm_release_pfn_dirty(pfn); - else - kvm_release_pfn_clean(pfn); + kvm_set_pfn_dirty(pfn); rmapp = gfn_to_rmap(kvm, sp-gfns[spte - sp-spt], sp-role.level); if (!*rmapp) { printk(KERN_ERR rmap_remove: %p %llx 0-BUG\n, spte, *spte); @@ -1877,8 +1875,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, page_header_update_slot(vcpu-kvm, sptep, gfn); if (!was_rmapped) { rmap_count = rmap_add(vcpu, sptep, gfn); - if (!is_rmap_spte(*sptep)) - kvm_release_pfn_clean(pfn); + kvm_release_pfn_clean(pfn); if (rmap_count RMAP_RECYCLE_THRESHOLD) rmap_recycle(vcpu, sptep, gfn); } else { -- 1.5.6.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/3] add SPTE_HOST_WRITEABLE flag to the shadow ptes
this flag notify that the host physical page we are pointing to from the spte is write protected, and therefore we cant change its access to be write unless we run get_user_pages(write = 1). (this is needed for change_pte support in kvm) Signed-off-by: Izik Eidus iei...@redhat.com --- arch/x86/kvm/mmu.c | 15 +++ arch/x86/kvm/paging_tmpl.h | 18 +++--- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6c67b23..5cd8b4e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -156,6 +156,8 @@ module_param(oos_shadow, bool, 0644); #define CREATE_TRACE_POINTS #include mmutrace.h +#define SPTE_HOST_WRITEABLE (1ULL PT_FIRST_AVAIL_BITS_SHIFT) + #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) struct kvm_rmap_desc { @@ -1754,7 +1756,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, int user_fault, int write_fault, int dirty, int level, gfn_t gfn, pfn_t pfn, bool speculative, - bool can_unsync) + bool can_unsync, bool reset_host_protection) { u64 spte; int ret = 0; @@ -1781,6 +1783,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= kvm_x86_ops-get_mt_mask(vcpu, gfn, kvm_is_mmio_pfn(pfn)); + if (reset_host_protection) + spte |= SPTE_HOST_WRITEABLE; + spte |= (u64)pfn PAGE_SHIFT; if ((pte_access ACC_WRITE_MASK) @@ -1826,7 +1831,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pt_access, unsigned pte_access, int user_fault, int write_fault, int dirty, int *ptwrite, int level, gfn_t gfn, -pfn_t pfn, bool speculative) +pfn_t pfn, bool speculative, +bool reset_host_protection) { int was_rmapped = 0; int was_writeble = is_writeble_pte(*sptep); @@ -1858,7 +1864,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, - dirty, level, gfn, pfn, speculative, true)) { + dirty, level, gfn, pfn, speculative, true, + reset_host_protection)) { if (write_fault) *ptwrite = 1; 
kvm_x86_ops-tlb_flush(vcpu); @@ -1906,7 +1913,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, if (iterator.level == level) { mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 0, write, 1, pt_write, -level, gfn, pfn, false); +level, gfn, pfn, false, true); ++vcpu-stat.pf_fixed; break; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index d2fec9c..cfd2424 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -273,9 +273,13 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, if (mmu_notifier_retry(vcpu, vcpu-arch.update_pte.mmu_seq)) return; kvm_get_pfn(pfn); + /* +* we call mmu_set_spte() with reset_host_protection = true beacuse that +* vcpu-arch.update_pte.pfn was fetched from get_user_pages(write = 1). +*/ mmu_set_spte(vcpu, spte, page-role.access, pte_access, 0, 0, gpte PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, -gpte_to_gfn(gpte), pfn, true); +gpte_to_gfn(gpte), pfn, true, true); } /* @@ -308,7 +312,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, user_fault, write_fault, gw-ptes[gw-level-1] PT_DIRTY_MASK, ptwrite, level, -gw-gfn, pfn, false); +gw-gfn, pfn, false, true); break; } @@ -558,6 +562,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { int i, offset, nr_present; + bool reset_host_protection; offset = nr_present = 0; @@ -595,9 +600,16 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) nr_present++; pte_access = sp-role.access FNAME(gpte_access)(vcpu, gpte); + if (!(sp-spt[i] SPTE_HOST_WRITEABLE)) { + pte_access = ~ACC_WRITE_MASK; + reset_host_protection = 0; + } else { +
[PATCH 3/3] add support for change_pte mmu notifiers
this is needed for kvm if it want ksm to directly map pages into its shadow page tables. Signed-off-by: Izik Eidus iei...@redhat.com --- arch/x86/include/asm/kvm_host.h |1 + arch/x86/kvm/mmu.c | 64 +- virt/kvm/kvm_main.c | 14 3 files changed, 70 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3be0004..d838922 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -796,6 +796,7 @@ asmlinkage void kvm_handle_fault_on_reboot(void); #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_age_hva(struct kvm *kvm, unsigned long hva); +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5cd8b4e..0905ca2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -748,7 +748,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) return write_protected; } -static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) +static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data) { u64 *spte; int need_tlb_flush = 0; @@ -763,8 +763,47 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) return need_tlb_flush; } -static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, - int (*handler)(struct kvm *kvm, unsigned long *rmapp)) +static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data) +{ + int need_flush = 0; + u64 *spte, new_spte; + pte_t *ptep = (pte_t *)data; + pfn_t new_pfn; + + WARN_ON(pte_huge(*ptep)); + new_pfn = pte_pfn(*ptep); + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + BUG_ON(!is_shadow_present_pte(*spte)); + rmap_printk(kvm_set_pte_rmapp: spte %p %llx\n, spte, *spte); + need_flush = 1; + if (pte_write(*ptep)) { + rmap_remove(kvm, 
spte); + __set_spte(spte, shadow_trap_nonpresent_pte); + spte = rmap_next(kvm, rmapp, NULL); + } else { + new_spte = *spte ~ (PT64_BASE_ADDR_MASK); + new_spte |= new_pfn PAGE_SHIFT; + + if (!pte_write(*ptep)) { + new_spte = ~PT_WRITABLE_MASK; + new_spte = ~SPTE_HOST_WRITEABLE; + if (is_writeble_pte(*spte)) + kvm_set_pfn_dirty(spte_to_pfn(*spte)); + } + __set_spte(spte, new_spte); + spte = rmap_next(kvm, rmapp, spte); + } + } + if (need_flush) + kvm_flush_remote_tlbs(kvm); + + return 0; +} + +static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, u64 data, + int (*handler)(struct kvm *kvm, unsigned long *rmapp, +u64 data)) { int i, j; int retval = 0; @@ -786,13 +825,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, if (hva = start hva end) { gfn_t gfn_offset = (hva - start) PAGE_SHIFT; - retval |= handler(kvm, memslot-rmap[gfn_offset]); + retval |= handler(kvm, memslot-rmap[gfn_offset], + data); for (j = 0; j KVM_NR_PAGE_SIZES - 1; ++j) { int idx = gfn_offset; idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); retval |= handler(kvm, - memslot-lpage_info[j][idx].rmap_pde); + memslot-lpage_info[j][idx].rmap_pde, + data); } } } @@ -802,10 +843,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) { - return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); + return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp); +} + +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + kvm_handle_hva(kvm, hva, (u64)pte, kvm_set_pte_rmapp); } -static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) +static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data) { u64 *spte; int young = 0; @@ -841,13 +887,13 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
Re: [PATCH 3/3] add support for change_pte mmu notifiers
Izik Eidus wrote: this is needed for kvm if it want ksm to directly map pages into its shadow page tables. Signed-off-by: Izik Eidus iei...@redhat.com --- arch/x86/include/asm/kvm_host.h |1 + arch/x86/kvm/mmu.c | 64 +- virt/kvm/kvm_main.c | 14 3 files changed, 70 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3be0004..d838922 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -796,6 +796,7 @@ asmlinkage void kvm_handle_fault_on_reboot(void); #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_age_hva(struct kvm *kvm, unsigned long hva); +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5cd8b4e..0905ca2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -748,7 +748,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) return write_protected; } -static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) +static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data) { u64 *spte; int need_tlb_flush = 0; @@ -763,8 +763,47 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) return need_tlb_flush; } -static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, - int (*handler)(struct kvm *kvm, unsigned long *rmapp)) +static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data) +{ + int need_flush = 0; + u64 *spte, new_spte; + pte_t *ptep = (pte_t *)data; + pfn_t new_pfn; + + WARN_ON(pte_huge(*ptep)); + new_pfn = pte_pfn(*ptep); + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + BUG_ON(!is_shadow_present_pte(*spte)); + rmap_printk(kvm_set_pte_rmapp: spte %p %llx\n, spte, *spte); + need_flush = 1; + if (pte_write(*ptep)) { + 
rmap_remove(kvm, spte); + __set_spte(spte, shadow_trap_nonpresent_pte); + spte = rmap_next(kvm, rmapp, NULL); + } else { + new_spte = *spte ~ (PT64_BASE_ADDR_MASK); + new_spte |= new_pfn PAGE_SHIFT; + + if (!pte_write(*ptep)) { Just noticed that this if is not needed (we got to get here if we had if (pte_write(*ptep)) { few lines before...).. I will resend + new_spte = ~PT_WRITABLE_MASK; + new_spte = ~SPTE_HOST_WRITEABLE; + if (is_writeble_pte(*spte)) + kvm_set_pfn_dirty(spte_to_pfn(*spte)); + } + __set_spte(spte, new_spte); + spte = rmap_next(kvm, rmapp, spte); + } + } + if (need_flush) + kvm_flush_remote_tlbs(kvm); + -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/3] kvm ksm support v3
Change from v2 : remove unused if. Thanks. Izik Eidus (3): kvm: dont hold pagecount reference for mapped sptes pages add SPTE_HOST_WRITEABLE flag to the shadow ptes add support for change_pte mmu notifiers arch/x86/include/asm/kvm_host.h |1 + arch/x86/kvm/mmu.c | 84 ++ arch/x86/kvm/paging_tmpl.h | 18 +++- virt/kvm/kvm_main.c | 14 ++ 4 files changed, 96 insertions(+), 21 deletions(-) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/3] kvm: dont hold pagecount reference for mapped sptes pages
When using mmu notifiers, we are allowed to remove the page count reference tooken by get_user_pages to a specific page that is mapped inside the shadow page tables. This is needed so we can balance the pagecount against mapcount checking. (Right now kvm increase the pagecount and does not increase the mapcount when mapping page into shadow page table entry, so when comparing pagecount against mapcount, you have no reliable result.) Signed-off-by: Izik Eidus iei...@redhat.com --- arch/x86/kvm/mmu.c |7 ++- 1 files changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index eca41ae..6c67b23 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -634,9 +634,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) if (*spte shadow_accessed_mask) kvm_set_pfn_accessed(pfn); if (is_writeble_pte(*spte)) - kvm_release_pfn_dirty(pfn); - else - kvm_release_pfn_clean(pfn); + kvm_set_pfn_dirty(pfn); rmapp = gfn_to_rmap(kvm, sp-gfns[spte - sp-spt], sp-role.level); if (!*rmapp) { printk(KERN_ERR rmap_remove: %p %llx 0-BUG\n, spte, *spte); @@ -1877,8 +1875,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, page_header_update_slot(vcpu-kvm, sptep, gfn); if (!was_rmapped) { rmap_count = rmap_add(vcpu, sptep, gfn); - if (!is_rmap_spte(*sptep)) - kvm_release_pfn_clean(pfn); + kvm_release_pfn_clean(pfn); if (rmap_count RMAP_RECYCLE_THRESHOLD) rmap_recycle(vcpu, sptep, gfn); } else { -- 1.5.6.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/3] add SPTE_HOST_WRITEABLE flag to the shadow ptes
this flag notify that the host physical page we are pointing to from the spte is write protected, and therefore we cant change its access to be write unless we run get_user_pages(write = 1). (this is needed for change_pte support in kvm) Signed-off-by: Izik Eidus iei...@redhat.com --- arch/x86/kvm/mmu.c | 15 +++ arch/x86/kvm/paging_tmpl.h | 18 +++--- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6c67b23..5cd8b4e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -156,6 +156,8 @@ module_param(oos_shadow, bool, 0644); #define CREATE_TRACE_POINTS #include mmutrace.h +#define SPTE_HOST_WRITEABLE (1ULL PT_FIRST_AVAIL_BITS_SHIFT) + #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) struct kvm_rmap_desc { @@ -1754,7 +1756,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, int user_fault, int write_fault, int dirty, int level, gfn_t gfn, pfn_t pfn, bool speculative, - bool can_unsync) + bool can_unsync, bool reset_host_protection) { u64 spte; int ret = 0; @@ -1781,6 +1783,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= kvm_x86_ops-get_mt_mask(vcpu, gfn, kvm_is_mmio_pfn(pfn)); + if (reset_host_protection) + spte |= SPTE_HOST_WRITEABLE; + spte |= (u64)pfn PAGE_SHIFT; if ((pte_access ACC_WRITE_MASK) @@ -1826,7 +1831,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pt_access, unsigned pte_access, int user_fault, int write_fault, int dirty, int *ptwrite, int level, gfn_t gfn, -pfn_t pfn, bool speculative) +pfn_t pfn, bool speculative, +bool reset_host_protection) { int was_rmapped = 0; int was_writeble = is_writeble_pte(*sptep); @@ -1858,7 +1864,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, - dirty, level, gfn, pfn, speculative, true)) { + dirty, level, gfn, pfn, speculative, true, + reset_host_protection)) { if (write_fault) *ptwrite = 1; 
kvm_x86_ops-tlb_flush(vcpu); @@ -1906,7 +1913,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, if (iterator.level == level) { mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 0, write, 1, pt_write, -level, gfn, pfn, false); +level, gfn, pfn, false, true); ++vcpu-stat.pf_fixed; break; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index d2fec9c..cfd2424 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -273,9 +273,13 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, if (mmu_notifier_retry(vcpu, vcpu-arch.update_pte.mmu_seq)) return; kvm_get_pfn(pfn); + /* +* we call mmu_set_spte() with reset_host_protection = true beacuse that +* vcpu-arch.update_pte.pfn was fetched from get_user_pages(write = 1). +*/ mmu_set_spte(vcpu, spte, page-role.access, pte_access, 0, 0, gpte PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, -gpte_to_gfn(gpte), pfn, true); +gpte_to_gfn(gpte), pfn, true, true); } /* @@ -308,7 +312,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, user_fault, write_fault, gw-ptes[gw-level-1] PT_DIRTY_MASK, ptwrite, level, -gw-gfn, pfn, false); +gw-gfn, pfn, false, true); break; } @@ -558,6 +562,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { int i, offset, nr_present; + bool reset_host_protection; offset = nr_present = 0; @@ -595,9 +600,16 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) nr_present++; pte_access = sp-role.access FNAME(gpte_access)(vcpu, gpte); + if (!(sp-spt[i] SPTE_HOST_WRITEABLE)) { + pte_access = ~ACC_WRITE_MASK; + reset_host_protection = 0; + } else { +
[PATCH 3/3] add support for change_pte mmu notifiers
this is needed for kvm if it want ksm to directly map pages into its shadow page tables. Signed-off-by: Izik Eidus iei...@redhat.com --- arch/x86/include/asm/kvm_host.h |1 + arch/x86/kvm/mmu.c | 62 +- virt/kvm/kvm_main.c | 14 + 3 files changed, 68 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3be0004..d838922 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -796,6 +796,7 @@ asmlinkage void kvm_handle_fault_on_reboot(void); #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_age_hva(struct kvm *kvm, unsigned long hva); +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5cd8b4e..ceec065 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -748,7 +748,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) return write_protected; } -static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) +static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data) { u64 *spte; int need_tlb_flush = 0; @@ -763,8 +763,45 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) return need_tlb_flush; } -static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, - int (*handler)(struct kvm *kvm, unsigned long *rmapp)) +static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data) +{ + int need_flush = 0; + u64 *spte, new_spte; + pte_t *ptep = (pte_t *)data; + pfn_t new_pfn; + + WARN_ON(pte_huge(*ptep)); + new_pfn = pte_pfn(*ptep); + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + BUG_ON(!is_shadow_present_pte(*spte)); + rmap_printk(kvm_set_pte_rmapp: spte %p %llx\n, spte, *spte); + need_flush = 1; + if (pte_write(*ptep)) { + rmap_remove(kvm, 
spte); + __set_spte(spte, shadow_trap_nonpresent_pte); + spte = rmap_next(kvm, rmapp, NULL); + } else { + new_spte = *spte ~ (PT64_BASE_ADDR_MASK); + new_spte |= new_pfn PAGE_SHIFT; + + new_spte = ~PT_WRITABLE_MASK; + new_spte = ~SPTE_HOST_WRITEABLE; + if (is_writeble_pte(*spte)) + kvm_set_pfn_dirty(spte_to_pfn(*spte)); + __set_spte(spte, new_spte); + spte = rmap_next(kvm, rmapp, spte); + } + } + if (need_flush) + kvm_flush_remote_tlbs(kvm); + + return 0; +} + +static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, u64 data, + int (*handler)(struct kvm *kvm, unsigned long *rmapp, +u64 data)) { int i, j; int retval = 0; @@ -786,13 +823,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, if (hva = start hva end) { gfn_t gfn_offset = (hva - start) PAGE_SHIFT; - retval |= handler(kvm, memslot-rmap[gfn_offset]); + retval |= handler(kvm, memslot-rmap[gfn_offset], + data); for (j = 0; j KVM_NR_PAGE_SIZES - 1; ++j) { int idx = gfn_offset; idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); retval |= handler(kvm, - memslot-lpage_info[j][idx].rmap_pde); + memslot-lpage_info[j][idx].rmap_pde, + data); } } } @@ -802,10 +841,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) { - return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); + return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp); +} + +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + kvm_handle_hva(kvm, hva, (u64)pte, kvm_set_pte_rmapp); } -static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) +static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data) { u64 *spte; int young = 0; @@ -841,13 +885,13 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) gfn = unalias_gfn(vcpu-kvm, gfn); rmapp = gfn_to_rmap(vcpu-kvm, gfn, sp-role.level); -
Re: [PATCH] x86: Pick up local arch trace header
On Sun, Sep 20, 2009 at 3:49 PM, Aidan Marks ai...@cisco.com wrote: Ah, right. ok. hmm, still getting these symbol warnings: vger kvm-kmod # git checkout -b queue origin/queue M linux-2.6 Branch queue set up to track remote branch queue from origin. Switched to a new branch 'queue' vger kvm-kmod # git submodule update Submodule path 'linux-2.6': checked out 'abb015ac65852287c7a7c243c8cdee966a38854d' vger kvm-kmod # ./configure --kerneldir=/usr/src/linux vger kvm-kmod # make sync ./sync -v v2.6.31-rc3-4139-gabb015a -l ./linux-2.6 vger kvm-kmod # make make -C /usr/src/linux M=`pwd` \ LINUXINCLUDE=-I`pwd`/include -Iinclude \ -Iarch/x86/include -I`pwd`/include-compat \ -include include/linux/autoconf.h \ -include `pwd`/x86/external-module-compat.h \ $@ make[1]: Entering directory `/usr/src/linux-2.6.31-gentoo' CC [M] /tmp/kvm-kmod/x86/svm.o CC [M] /tmp/kvm-kmod/x86/vmx.o CC [M] /tmp/kvm-kmod/x86/vmx-debug.o CC [M] /tmp/kvm-kmod/x86/kvm_main.o CC [M] /tmp/kvm-kmod/x86/x86.o CC [M] /tmp/kvm-kmod/x86/mmu.o CC [M] /tmp/kvm-kmod/x86/emulate.o CC [M] /tmp/kvm-kmod/x86/../anon_inodes.o CC [M] /tmp/kvm-kmod/x86/irq.o CC [M] /tmp/kvm-kmod/x86/i8259.o CC [M] /tmp/kvm-kmod/x86/lapic.o CC [M] /tmp/kvm-kmod/x86/ioapic.o CC [M] /tmp/kvm-kmod/x86/preempt.o CC [M] /tmp/kvm-kmod/x86/i8254.o CC [M] /tmp/kvm-kmod/x86/coalesced_mmio.o CC [M] /tmp/kvm-kmod/x86/irq_comm.o CC [M] /tmp/kvm-kmod/x86/timer.o CC [M] /tmp/kvm-kmod/x86/eventfd.o CC [M] /tmp/kvm-kmod/x86/assigned-dev.o CC [M] /tmp/kvm-kmod/x86/../external-module-compat.o CC [M] /tmp/kvm-kmod/x86/../request-irq-compat.o CC [M] /tmp/kvm-kmod/x86/iommu.o CC [M] /tmp/kvm-kmod/x86/../srcu.o LD [M] /tmp/kvm-kmod/x86/kvm.o LD [M] /tmp/kvm-kmod/x86/kvm-intel.o LD [M] /tmp/kvm-kmod/x86/kvm-amd.o LD /tmp/kvm-kmod/built-in.o Building modules, stage 2. MODPOST 3 modules WARNING: vma_kernel_pagesize [/tmp/kvm-kmod/x86/kvm.ko] undefined! WARNING: __tracepoint_kvm_mmu_paging_element [/tmp/kvm-kmod/x86/kvm.ko] undefined! 
WARNING: __tracepoint_kvm_mmu_sync_page [/tmp/kvm-kmod/x86/kvm.ko] undefined! WARNING: __tracepoint_kvm_mmio [/tmp/kvm-kmod/x86/kvm.ko] undefined! WARNING: __tracepoint_kvm_mmu_set_accessed_bit [/tmp/kvm-kmod/x86/kvm.ko] undefined! WARNING: __tracepoint_kvm_exit [/tmp/kvm-kmod/x86/kvm.ko] undefined! WARNING: __tracepoint_kvm_pio [/tmp/kvm-kmod/x86/kvm.ko] undefined! WARNING: __tracepoint_kvm_mmu_zap_page [/tmp/kvm-kmod/x86/kvm.ko] undefined! WARNING: __tracepoint_kvm_pic_set_irq [/tmp/kvm-kmod/x86/kvm.ko] undefined! WARNING: __tracepoint_kvm_cpuid [/tmp/kvm-kmod/x86/kvm.ko] undefined! WARNING: __tracepoint_kvm_mmu_unsync_page [/tmp/kvm-kmod/x86/kvm.ko] undefined! WARNING: __tracepoint_kvm_msr [/tmp/kvm-kmod/x86/kvm.ko] undefined! WARNING: __tracepoint_kvm_set_irq [/tmp/kvm-kmod/x86/kvm.ko] undefined! WARNING: __tracepoint_kvm_apic_accept_irq [/tmp/kvm-kmod/x86/kvm.ko] undefined! WARNING: __tracepoint_kvm_inj_virq [/tmp/kvm-kmod/x86/kvm.ko] undefined! WARNING: __tracepoint_kvm_hypercall [/tmp/kvm-kmod/x86/kvm.ko] undefined! WARNING: __tracepoint_kvm_page_fault [/tmp/kvm-kmod/x86/kvm.ko] undefined! WARNING: __tracepoint_kvm_entry [/tmp/kvm-kmod/x86/kvm.ko] undefined! WARNING: __tracepoint_kvm_apic [/tmp/kvm-kmod/x86/kvm.ko] undefined! WARNING: __tracepoint_kvm_ioapic_set_irq [/tmp/kvm-kmod/x86/kvm.ko] undefined! WARNING: __tracepoint_kvm_apic_ipi [/tmp/kvm-kmod/x86/kvm.ko] undefined! WARNING: __tracepoint_kvm_cr [/tmp/kvm-kmod/x86/kvm.ko] undefined! WARNING: __tracepoint_kvm_ack_irq [/tmp/kvm-kmod/x86/kvm.ko] undefined! WARNING: __tracepoint_kvm_mmu_walker_error [/tmp/kvm-kmod/x86/kvm.ko] undefined! WARNING: __tracepoint_kvm_mmu_get_page [/tmp/kvm-kmod/x86/kvm.ko] undefined! WARNING: __tracepoint_kvm_mmu_set_dirty_bit [/tmp/kvm-kmod/x86/kvm.ko] undefined! WARNING: __tracepoint_kvm_mmu_pagetable_walk [/tmp/kvm-kmod/x86/kvm.ko] undefined! WARNING: __tracepoint_kvm_msi_set_irq [/tmp/kvm-kmod/x86/kvm.ko] undefined! 
CC /tmp/kvm-kmod/x86/kvm-amd.mod.o LD [M] /tmp/kvm-kmod/x86/kvm-amd.ko CC /tmp/kvm-kmod/x86/kvm-intel.mod.o LD [M] /tmp/kvm-kmod/x86/kvm-intel.ko CC /tmp/kvm-kmod/x86/kvm.mod.o LD [M] /tmp/kvm-kmod/x86/kvm.ko make[1]: Leaving directory `/usr/src/linux-2.6.31-gentoo' vger kvm-kmod # Aidan, were you able to solve this? I was having the same (original) problem in Xubuntu 64-bits with a custom 2.6.31 kernel and kvm-88. I still haven't tried Jan's patch (paper deadline at work) but I wanted to know if you had made any progress. Cheers, Jorge -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server
On 09/23/2009 08:58 PM, Gregory Haskins wrote: It also pulls parts of the device model into the host kernel. That is the point. Most of it needs to be there for performance. To clarify this point: There are various aspects about designing high-performance virtual devices such as providing the shortest paths possible between the physical resources and the consumers. Conversely, we also need to ensure that we meet proper isolation/protection guarantees at the same time. What this means is there are various aspects to any high-performance PV design that require to be placed in-kernel to maximize the performance yet properly isolate the guest. For instance, you are required to have your signal-path (interrupts and hypercalls), your memory-path (gpa translation), and addressing/isolation model in-kernel to maximize performance. Exactly. That's what vhost puts into the kernel and nothing more. Vbus accomplishes its in-kernel isolation model by providing a container concept, where objects are placed into this container by userspace. The host kernel enforces isolation/protection by using a namespace to identify objects that is only relevant within a specific container's context (namely, a u32 dev-id). The guest addresses the objects by its dev-id, and the kernel ensures that the guest can't access objects outside of its dev-id namespace. vhost manages to accomplish this without any kernel support. The guest simply has not access to any vhost resources other than the guest-host doorbell, which is handed to the guest outside vhost (so it's somebody else's problem, in userspace). All that is required is a way to transport a message with a devid attribute as an address (such as DEVCALL(devid)) and the framework provides the rest of the decode+execute function. vhost avoids that. Contrast this to vhost+virtio-pci (called simply vhost from here). It's the wrong name. vhost implements only the data path. 
It is not immune to requiring in-kernel addressing support either, but rather it just does it differently (and its not as you might expect via qemu). Vhost relies on QEMU to render PCI objects to the guest, which the guest assigns resources (such as BARs, interrupts, etc). vhost does not rely on qemu. It relies on its user to handle configuration. In one important case it's qemu+pci. It could just as well be the lguest launcher. A PCI-BAR in this example may represent a PIO address for triggering some operation in the device-model's fast-path. For it to have meaning in the fast-path, KVM has to have in-kernel knowledge of what a PIO-exit is, and what to do with it (this is where pio-bus and ioeventfd come in). The programming of the PIO-exit and the ioeventfd are likewise controlled by some userspace management entity (i.e. qemu). The PIO address and value tuple form the address, and the ioeventfd framework within KVM provide the decode+execute function. Right. This idea seemingly works fine, mind you, but it rides on top of a *lot* of stuff including but not limited to: the guests pci stack, the qemu pci emulation, kvm pio support, and ioeventfd. When you get into situations where you don't have PCI or even KVM underneath you (e.g. a userspace container, Ira's rig, etc) trying to recreate all of that PCI infrastructure for the sake of using PCI is, IMO, a lot of overhead for little gain. For the N+1th time, no. vhost is perfectly usable without pci. Can we stop raising and debunking this point? All you really need is a simple decode+execute mechanism, and a way to program it from userspace control. vbus tries to do just that: commoditize it so all you need is the transport of the control messages (like DEVCALL()), but the decode+execute itself is reuseable, even across various environments (like KVM or Iras rig). If you think it should be commodotized, write libvhostconfig.so. 
And your argument, I believe, is that vbus allows both to be implemented in the kernel (though to reiterate, its optional) and is therefore a bad design, so lets discuss that. I believe the assertion is that things like config-space are best left to userspace, and we should only relegate fast-path duties to the kernel. The problem is that, in my experience, a good deal of config-space actually influences the fast-path and thus needs to interact with the fast-path mechanism eventually anyway. Whats left over that doesn't fall into this category may cheaply ride on existing plumbing, so its not like we created something new or unnatural just to support this subclass of config-space. Flexibility is reduced, because changing code in the kernel is more expensive than in userspace, and kernel/user interfaces aren't typically as wide as pure userspace interfaces. Security is reduced, since a bug in the kernel affects the host, while a bug in userspace affects just on guest. Example: feature negotiation. If it happens in
Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server
Avi Kivity wrote: On 09/23/2009 08:58 PM, Gregory Haskins wrote: It also pulls parts of the device model into the host kernel. That is the point. Most of it needs to be there for performance. To clarify this point: There are various aspects about designing high-performance virtual devices such as providing the shortest paths possible between the physical resources and the consumers. Conversely, we also need to ensure that we meet proper isolation/protection guarantees at the same time. What this means is there are various aspects to any high-performance PV design that require to be placed in-kernel to maximize the performance yet properly isolate the guest. For instance, you are required to have your signal-path (interrupts and hypercalls), your memory-path (gpa translation), and addressing/isolation model in-kernel to maximize performance. Exactly. That's what vhost puts into the kernel and nothing more. Actually, no. Generally, _KVM_ puts those things into the kernel, and vhost consumes them. Without KVM (or something equivalent), vhost is incomplete. One of my goals with vbus is to generalize the something equivalent part here. I know you may not care about non-kvm use cases, and thats fine. No one says you have to. However, note that some of use do care about these non-kvm cases, and thus its a distinction I am making here as a benefit of the vbus framework. Vbus accomplishes its in-kernel isolation model by providing a container concept, where objects are placed into this container by userspace. The host kernel enforces isolation/protection by using a namespace to identify objects that is only relevant within a specific container's context (namely, a u32 dev-id). The guest addresses the objects by its dev-id, and the kernel ensures that the guest can't access objects outside of its dev-id namespace. vhost manages to accomplish this without any kernel support. No, vhost manages to accomplish this because of KVMs kernel support (ioeventfd, etc). 
Without a KVM-like in-kernel support, vhost is a merely a kind of tuntap-like clone signalled by eventfds. vbus on the other hand, generalizes one more piece of the puzzle (namely, the function of pio+ioeventfd and userspace's programming of it) by presenting the devid namespace and container concept. This goes directly to my rebuttal of your claim that vbus places too much in the kernel. I state that, one way or the other, address decode and isolation _must_ be in the kernel for performance. Vbus does this with a devid/container scheme. vhost+virtio-pci+kvm does it with pci+pio+ioeventfd. The guest simply has not access to any vhost resources other than the guest-host doorbell, which is handed to the guest outside vhost (so it's somebody else's problem, in userspace). You mean _controlled_ by userspace, right? Obviously, the other side of the kernel still needs to be programmed (ioeventfd, etc). Otherwise, vhost would be pointless: e.g. just use vanilla tuntap if you don't need fast in-kernel decoding. All that is required is a way to transport a message with a devid attribute as an address (such as DEVCALL(devid)) and the framework provides the rest of the decode+execute function. vhost avoids that. No, it doesn't avoid it. It just doesn't specify how its done, and relies on something else to do it on its behalf. Conversely, vbus specifies how its done, but not how to transport the verb across the wire. That is the role of the vbus-connector abstraction. Contrast this to vhost+virtio-pci (called simply vhost from here). It's the wrong name. vhost implements only the data path. Understood, but vhost+virtio-pci is what I am contrasting, and I use vhost for short from that point on because I am too lazy to type the whole name over and over ;) It is not immune to requiring in-kernel addressing support either, but rather it just does it differently (and its not as you might expect via qemu). 
Vhost relies on QEMU to render PCI objects to the guest, which the guest assigns resources (such as BARs, interrupts, etc). vhost does not rely on qemu. It relies on its user to handle configuration. In one important case it's qemu+pci. It could just as well be the lguest launcher. I meant vhost=vhost+virtio-pci here. Sorry for the confusion. The point I am making specifically is that vhost in general relies on other in-kernel components to function. I.e. It cannot function without having something like the PCI model to build an IO namespace. That namespace (in this case, pio addresses+data tuples) are used for the in-kernel addressing function under KVM + virtio-pci. The case of the lguest launcher is a good one to highlight. Yes, you can presumably also use lguest with vhost, if the requisite facilities are exposed to lguest-bus, and some eventfd based thing like ioeventfd is written for the host (if it doesnt exist already). And
[PATCH] kvm: allocate correct size for dirty bitmap
The dirty bitmap copied out to userspace is stored in a long array, and gets copied out to userspace accordingly. This patch accounts for that correctly. Currently I'm seeing kvm crashing due to writing beyond the end of the alloc'd dirty bitmap memory, because the buffer has the wrong size. Signed-off-by: Bruce Rogers --- qemu-kvm.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qemu-kvm.c b/qemu-kvm.c index 6511cb6..ee5db76 100644 --- a/qemu-kvm.c +++ b/qemu-kvm.c @@ -702,7 +702,7 @@ int kvm_get_dirty_pages_range(kvm_context_t kvm, unsigned long phys_addr, for (i = 0; i KVM_MAX_NUM_MEM_REGIONS; ++i) { if ((slots[i].len (uint64_t) slots[i].phys_addr = phys_addr) ((uint64_t) slots[i].phys_addr + slots[i].len = end_addr)) { -buf = qemu_malloc((slots[i].len / 4096 + 7) / 8 + 2); +buf = qemu_malloc(BITMAP_SIZE(slots[i].len)); r = kvm_get_map(kvm, KVM_GET_DIRTY_LOG, i, buf); if (r) { qemu_free(buf); -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH: kvm 1/6] Code motion. Separate timer initialization into an independent function.
Signed-off-by: Zachary Amsden zams...@redhat.com --- arch/x86/kvm/x86.c | 23 +++ 1 files changed, 15 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fedac9d..15d2ace 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3116,9 +3116,22 @@ static struct notifier_block kvmclock_cpufreq_notifier_block = { .notifier_call = kvmclock_cpufreq_notifier }; +static void kvm_timer_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(cpu_tsc_khz, cpu) = tsc_khz; + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + tsc_khz_ref = tsc_khz; + cpufreq_register_notifier(kvmclock_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + } +} + int kvm_arch_init(void *opaque) { - int r, cpu; + int r; struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; if (kvm_x86_ops) { @@ -3150,13 +3163,7 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0); - for_each_possible_cpu(cpu) - per_cpu(cpu_tsc_khz, cpu) = tsc_khz; - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { - tsc_khz_ref = tsc_khz; - cpufreq_register_notifier(kvmclock_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - } + kvm_timer_init(); return 0; -- 1.6.4.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH: kvm 3/6] Fix hotadd of CPUs for KVM.
Both VMX and SVM require per-cpu memory allocation, which is done at module init time, for only online cpus. When bringing a new CPU online, we must also allocate this structure. The method chosen to implement this is to make the CPU online notifier available via a call to the arch code. This allows memory allocation to be done smoothly, without any need to allocate extra structures. Note: CPU up notifiers may call KVM callback before calling cpufreq callbacks. This would causes the CPU frequency not to be detected (and it is not always clear on non-constant TSC platforms what the bringup TSC rate will be, so the guess of using tsc_khz could be wrong). So, we clear the rate to zero in such a case and add logic to query it upon entry. Signed-off-by: Zachary Amsden zams...@redhat.com --- arch/x86/include/asm/kvm_host.h |2 ++ arch/x86/kvm/svm.c | 15 +-- arch/x86/kvm/vmx.c | 17 + arch/x86/kvm/x86.c | 14 +- include/linux/kvm_host.h|6 ++ virt/kvm/kvm_main.c |3 ++- 6 files changed, 53 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 299cc1b..b7dd14b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -459,6 +459,7 @@ struct descriptor_table { struct kvm_x86_ops { int (*cpu_has_kvm_support)(void); /* __init */ int (*disabled_by_bios)(void); /* __init */ + int (*cpu_hotadd)(int cpu); int (*hardware_enable)(void *dummy); void (*hardware_disable)(void *dummy); void (*check_processor_compatibility)(void *rtn); @@ -791,6 +792,7 @@ asmlinkage void kvm_handle_fault_on_reboot(void); _ASM_PTR 666b, 667b \n\t \ .popsection +#define KVM_ARCH_WANT_HOTPLUG_NOTIFIER #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_age_hva(struct kvm *kvm, unsigned long hva); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9a4daca..8f99d0c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -330,13 +330,13 @@ static int 
svm_hardware_enable(void *garbage) return -EBUSY; if (!has_svm()) { - printk(KERN_ERR svm_cpu_init: err EOPNOTSUPP on %d\n, me); + printk(KERN_ERR svm_hardware_enable: err EOPNOTSUPP on %d\n, me); return -EINVAL; } svm_data = per_cpu(svm_data, me); if (!svm_data) { - printk(KERN_ERR svm_cpu_init: svm_data is NULL on %d\n, + printk(KERN_ERR svm_hardware_enable: svm_data is NULL on %d\n, me); return -EINVAL; } @@ -394,6 +394,16 @@ err_1: } +static __cpuinit int svm_cpu_hotadd(int cpu) +{ + struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); + + if (svm_data) + return 0; + + return svm_cpu_init(cpu); +} + static void set_msr_interception(u32 *msrpm, unsigned msr, int read, int write) { @@ -2858,6 +2868,7 @@ static struct kvm_x86_ops svm_x86_ops = { .hardware_setup = svm_hardware_setup, .hardware_unsetup = svm_hardware_unsetup, .check_processor_compatibility = svm_check_processor_compat, + .cpu_hotadd = svm_cpu_hotadd, .hardware_enable = svm_hardware_enable, .hardware_disable = svm_hardware_disable, .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3fe0d42..b8a8428 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1408,6 +1408,22 @@ static __exit void hardware_unsetup(void) free_kvm_area(); } +static __cpuinit int vmx_cpu_hotadd(int cpu) +{ + struct vmcs *vmcs; + + if (per_cpu(vmxarea, cpu)) + return 0; + + vmcs = alloc_vmcs_cpu(cpu); + if (!vmcs) + return -ENOMEM; + + per_cpu(vmxarea, cpu) = vmcs; + + return 0; +} + static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) { struct kvm_vmx_segment_field *sf = kvm_vmx_segment_fields[seg]; @@ -3925,6 +3941,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .hardware_setup = hardware_setup, .hardware_unsetup = hardware_unsetup, .check_processor_compatibility = vmx_check_processor_compat, + .cpu_hotadd = vmx_cpu_hotadd, .hardware_enable = hardware_enable, .hardware_disable = hardware_disable, .cpu_has_accelerated_tpr = 
report_flexpriority, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 35082dd..05aea42 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1339,6 +1339,8 @@ out: void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { kvm_x86_ops-vcpu_load(vcpu, cpu); + if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) +
[PATCH: kvm 4/6] Fix hotremove of CPUs for KVM.
In the process of bringing down CPUs, the SVM / VMX structures associated with those CPUs are not freed. This may cause leaks when unloading and reloading the KVM module, as only the structures associated with online CPUs are cleaned up. Signed-off-by: Zachary Amsden zams...@redhat.com --- arch/x86/include/asm/kvm_host.h |1 + arch/x86/kvm/svm.c |1 + arch/x86/kvm/vmx.c |7 +++ arch/x86/kvm/x86.c |5 + include/linux/kvm_host.h|2 ++ virt/kvm/kvm_main.c |2 ++ 6 files changed, 18 insertions(+), 0 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b7dd14b..91e02d3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -460,6 +460,7 @@ struct kvm_x86_ops { int (*cpu_has_kvm_support)(void); /* __init */ int (*disabled_by_bios)(void); /* __init */ int (*cpu_hotadd)(int cpu); + void (*cpu_hotremove)(int cpu); int (*hardware_enable)(void *dummy); void (*hardware_disable)(void *dummy); void (*check_processor_compatibility)(void *rtn); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8f99d0c..7cf6c98 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2869,6 +2869,7 @@ static struct kvm_x86_ops svm_x86_ops = { .hardware_unsetup = svm_hardware_unsetup, .check_processor_compatibility = svm_check_processor_compat, .cpu_hotadd = svm_cpu_hotadd, + .cpu_hotremove = svm_cpu_uninit, .hardware_enable = svm_hardware_enable, .hardware_disable = svm_hardware_disable, .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b8a8428..1e3e9fc 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1424,6 +1424,12 @@ static __cpuinit int vmx_cpu_hotadd(int cpu) return 0; } +static __cpuinit void vmx_cpu_hotremove(int cpu) +{ + free_vmcs(per_cpu(vmxarea, cpu)); + per_cpu(vmxarea, cpu) = NULL; +} + static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) { struct kvm_vmx_segment_field *sf = kvm_vmx_segment_fields[seg]; @@ 
-3942,6 +3948,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .hardware_unsetup = hardware_unsetup, .check_processor_compatibility = vmx_check_processor_compat, .cpu_hotadd = vmx_cpu_hotadd, + .cpu_hotremove = vmx_cpu_hotremove, .hardware_enable = hardware_enable, .hardware_disable = hardware_disable, .cpu_has_accelerated_tpr = report_flexpriority, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 05aea42..38ba4a6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4719,6 +4719,11 @@ int kvm_arch_cpu_hotadd(int cpu) return kvm_x86_ops-cpu_hotadd(cpu); } +void kvm_arch_cpu_hotremove(int cpu) +{ + kvm_x86_ops-cpu_hotremove(cpu); +} + int kvm_arch_hardware_enable(void *garbage) { /* diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2f075c4..2c844f0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -347,8 +347,10 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); #ifdef KVM_ARCH_WANT_HOTPLUG_NOTIFIER int kvm_arch_cpu_hotadd(int cpu); +void kvm_arch_cpu_hotremove(int cpu); #else #define kvm_arch_cpu_hotadd(x) (0) +#define kvm_arch_cpu_hotremove(x) #endif int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1360db4..bd21927 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1725,11 +1725,13 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, printk(KERN_INFO kvm: disabling virtualization on CPU%d\n, cpu); hardware_disable(NULL); + kvm_arch_cpu_hotremove(cpu); break; case CPU_UP_CANCELED: printk(KERN_INFO kvm: disabling virtualization on CPU%d\n, cpu); smp_call_function_single(cpu, hardware_disable, NULL, 1); + kvm_arch_cpu_hotremove(cpu); break; case CPU_ONLINE: printk(KERN_INFO kvm: enabling virtualization on CPU%d\n, -- 1.6.4.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH: kvm 5/6] Don't unconditionally clear cpu_tsc_khz in hardware_enable.
For the non-hotplug case, this TSC speed should be available; instead, clear cpu_khz_tsc when bringing down a CPU so it is recomputed at the next bringup. Signed-off-by: Zachary Amsden zams...@redhat.com --- arch/x86/kvm/x86.c | 12 +--- 1 files changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 38ba4a6..f1470ce 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4721,18 +4721,16 @@ int kvm_arch_cpu_hotadd(int cpu) void kvm_arch_cpu_hotremove(int cpu) { + /* +* Reset TSC khz to zero so it is recomputed on bringup +*/ + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + per_cpu(cpu_tsc_khz, cpu) = 0; kvm_x86_ops-cpu_hotremove(cpu); } int kvm_arch_hardware_enable(void *garbage) { - /* -* Notifier callback chain may not have called cpufreq code -* yet, thus we must reset TSC khz to zero and recompute it -* before entering. -*/ - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) - per_cpu(cpu_tsc_khz, raw_smp_processor_id()) = 0; return kvm_x86_ops-hardware_enable(garbage); } -- 1.6.4.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH: kvm 6/6] Math is hard; let's do some cooking.
CPU frequency change callback provides new TSC frequency for us, and in the same units (kHz), so there is no reason to do any math. Signed-off-by: Zachary Amsden zams...@redhat.com --- arch/x86/kvm/x86.c |5 + 1 files changed, 1 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f1470ce..c849f8f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3083,15 +3083,12 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va struct kvm *kvm; struct kvm_vcpu *vcpu; int i, send_ipi = 0; - unsigned long old_khz; if (val == CPUFREQ_PRECHANGE freq-old freq-new) return 0; if (val == CPUFREQ_POSTCHANGE freq-old freq-new) return 0; - old_khz = per_cpu(cpu_tsc_khz, freq-cpu); - per_cpu(cpu_tsc_khz, freq-cpu) = cpufreq_scale(old_khz, freq-old, - freq-new); + per_cpu(cpu_tsc_khz, freq-cpu) = freq-new; spin_lock(kvm_lock); list_for_each_entry(kvm, vm_list, vm_list) { -- 1.6.4.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH: kvm 2/6] Kill the confusing tsc_ref_khz and ref_freq variables.
They are globals, not clearly protected by any ordering or locking, and vulnerable to various startup races. Instead, for variable TSC machines, register the cpufreq notifier and get the TSC frequency directly from the cpufreq machinery. Not only is it always right, it is also perfectly accurate, as no error prone measurement is required. On such machines, also detect the frequency when bringing a new CPU online; it isn't clear what frequency it will start with, and it may not correspond to the reference. Signed-off-by: Zachary Amsden zams...@redhat.com --- arch/x86/kvm/x86.c | 38 -- 1 files changed, 28 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 15d2ace..35082dd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -650,6 +650,19 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info * static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); +static inline void kvm_get_cpu_khz(int cpu) +{ + unsigned int khz = cpufreq_get(cpu); + if (khz <= 0) { + static int warn = 0; + if (warn++ < 10) + printk(KERN_ERR "kvm: could not get frequency for CPU + %d. 
Timers may be inaccurate\n", cpu); + khz = tsc_khz; + } + per_cpu(cpu_tsc_khz, cpu) = khz; +} + static void kvm_write_guest_time(struct kvm_vcpu *v) { struct timespec ts; @@ -3061,9 +3074,6 @@ static void bounce_off(void *info) /* nothing */ } -static unsigned int ref_freq; -static unsigned long tsc_khz_ref; - static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { @@ -3071,15 +3081,15 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va struct kvm *kvm; struct kvm_vcpu *vcpu; int i, send_ipi = 0; - - if (!ref_freq) - ref_freq = freq->old; + unsigned long old_khz; if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) return 0; if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) return 0; - per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); + old_khz = per_cpu(cpu_tsc_khz, freq->cpu); + per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(old_khz, freq->old, + freq->new); spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { @@ -3120,12 +3130,18 @@ static void kvm_timer_init(void) { int cpu; - for_each_possible_cpu(cpu) - per_cpu(cpu_tsc_khz, cpu) = tsc_khz; if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { - tsc_khz_ref = tsc_khz; cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); + for_each_online_cpu(cpu) + kvm_get_cpu_khz(cpu); + } else { + for_each_possible_cpu(cpu) + per_cpu(cpu_tsc_khz, cpu) = tsc_khz; + } + for_each_possible_cpu(cpu) { + printk(KERN_DEBUG "kvm: cpu %d = %ld khz\n", + cpu, per_cpu(cpu_tsc_khz, cpu)); } } @@ -4698,6 +4714,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) int kvm_arch_hardware_enable(void *garbage) { + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + kvm_get_cpu_khz(raw_smp_processor_id()); return kvm_x86_ops->hardware_enable(garbage); } -- 1.6.4.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at 
http://vger.kernel.org/majordomo-info.html
linux-next: tree build failure
Hi all, Today's linux-next build (powerpc ppc44x_defconfig) failed like this: In file included from arch/powerpc/kvm/booke.c:31: arch/powerpc/kvm/timing.h: In function 'kvmppc_account_exit_stat': arch/powerpc/kvm/timing.h:51: error: bit-field 'anonymous' width not an integer constant In file included from arch/powerpc/kvm/booke.h:26, from arch/powerpc/kvm/booke_emulate.c:23: arch/powerpc/kvm/timing.h: In function 'kvmppc_account_exit_stat': arch/powerpc/kvm/timing.h:51: error: bit-field 'anonymous' width not an integer constant Presumably caused by commit 8c87df457cb58fe75b9b893007917cf8095660a0 (BUILD_BUG_ON(): fix it and a couple of bogus uses of it). I applied the following patch for today. This inline function is only called from one place in one file ... From: Stephen Rothwell s...@canb.auug.org.au Date: Thu, 24 Sep 2009 15:13:20 +1000 Subject: [PATCH] powerpc/kvm: build fix for new BUILD_BUG_ON Signed-off-by: Stephen Rothwell s...@canb.auug.org.au --- arch/powerpc/kvm/timing.h |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/arch/powerpc/kvm/timing.h b/arch/powerpc/kvm/timing.h index bb13b1f..4c34099 100644 --- a/arch/powerpc/kvm/timing.h +++ b/arch/powerpc/kvm/timing.h @@ -48,7 +48,7 @@ static inline void kvmppc_set_exit_type(struct kvm_vcpu *vcpu, int type) {} static inline void kvmppc_account_exit_stat(struct kvm_vcpu *vcpu, int type) { /* type has to be known at build time for optimization */ - BUILD_BUG_ON(__builtin_constant_p(type)); + //BUILD_BUG_ON(__builtin_constant_p(type)); switch (type) { case EXT_INTR_EXITS: vcpu->stat.ext_intr_exits++; -- 1.6.4.3 -- Cheers, Stephen Rothwells...@canb.auug.org.au -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html