[PATCH] viostor driver. switch to full-duplex mode.

2009-09-23 Thread Vadim Rozenfeld


repository: /home/vadimr/shares/kvm-guest-drivers-windows
branch: master
commit ed4b9ade27b56e9ee37461c2cf72e46d75633e9c
Author: Vadim Rozenfeld <vroze...@redhat.com>
Date:   Wed Sep 23 11:28:48 2009 +0300

[PATCH] viostor driver. switch to full-duplex mode.

Signed-off-by: Vadim Rozenfeld <vroze...@redhat.com>

diff --git a/viostor/virtio_stor.c b/viostor/virtio_stor.c
index e4acaa0..297949a 100644
--- a/viostor/virtio_stor.c
+++ b/viostor/virtio_stor.c
@@ -194,6 +194,7 @@ VirtIoFindAdapter(
 ConfigInfo->WmiDataProvider = FALSE;
 #ifdef USE_STORPORT
 ConfigInfo->MapBuffers = STOR_MAP_NON_READ_WRITE_BUFFERS;
+ConfigInfo->SynchronizationModel = StorSynchronizeFullDuplex;
 #else
 ConfigInfo->MapBuffers = TRUE;
 #endif
@@ -323,6 +324,8 @@ VirtIoFindAdapter(
 return SP_RETURN_ERROR;
 }

+InitializeListHead(&adaptExt->list_head);
+
 return SP_RETURN_FOUND;
 }

@@ -488,7 +491,7 @@ VirtIoStartIo(
 case SCSIOP_WRITE: {
 Srb-SrbStatus = SRB_STATUS_PENDING;
 if(!RhelDoReadWrite(DeviceExtension, Srb)) {
-Srb->SrbStatus = SRB_STATUS_ABORTED;
+Srb->SrbStatus = SRB_STATUS_BUSY;
 CompleteSRB(DeviceExtension, Srb);
 }
 return TRUE;
@@ -561,7 +564,7 @@ VirtIoInterrupt(
 Srb->SrbStatus = SRB_STATUS_ERROR;
 break;
}
-
+   RemoveEntryList(&vbr->list_entry);
CompleteSRB(DeviceExtension, Srb);
 }
 }
diff --git a/viostor/virtio_stor.h b/viostor/virtio_stor.h
index 1c0dbb6..2d98738 100644
--- a/viostor/virtio_stor.h
+++ b/viostor/virtio_stor.h
@@ -78,12 +78,8 @@ typedef struct virtio_blk_outhdr {
 u64 sector;
 }blk_outhdr, *pblk_outhdr;

-struct list_head {
-struct list_head *next, *prev;
-};
-
 typedef struct virtio_blk_req {
-struct list_head list;
+LIST_ENTRY list_entry;
 struct request *req;
 blk_outhdr out_hdr;
 u8 status;
@@ -98,6 +94,7 @@ typedef struct _ADAPTER_EXTENSION {
 blk_config    info;
 ULONG queue_depth;
 BOOLEAN   dump_mode;
+LIST_ENTRY    list_head;
 }ADAPTER_EXTENSION, *PADAPTER_EXTENSION;

 typedef struct _RHEL_SRB_EXTENSION {
diff --git a/viostor/virtio_stor_hw_helper.c b/viostor/virtio_stor_hw_helper.c
index 43bde5a..3c09259 100644
--- a/viostor/virtio_stor_hw_helper.c
+++ b/viostor/virtio_stor_hw_helper.c
@@ -15,22 +15,33 @@
 #include "virtio_stor_hw_helper.h"

 #ifdef USE_STORPORT
-BOOLEAN
-RhelDoReadWrite(PVOID DeviceExtension,
-                PSCSI_REQUEST_BLOCK Srb)
+BOOLEAN
+SynchronizedAccessRoutine(
+    IN PVOID DeviceExtension,
+    IN PVOID Context
+    )
 {
     PADAPTER_EXTENSION  adaptExt = (PADAPTER_EXTENSION)DeviceExtension;
+    PSCSI_REQUEST_BLOCK Srb      = (PSCSI_REQUEST_BLOCK)Context;
     PRHEL_SRB_EXTENSION srbExt   = (PRHEL_SRB_EXTENSION)Srb->SrbExtension;
-
     if (adaptExt->pci_vq_info.vq->vq_ops->add_buf(adaptExt->pci_vq_info.vq,
-                  &srbExt->vbr.sg[0],
-                  srbExt->out, srbExt->in,
-                  &srbExt->vbr) == 0) {
+                  &srbExt->vbr.sg[0],
+                  srbExt->out, srbExt->in,
+                  &srbExt->vbr) == 0) {
+        InsertTailList(&adaptExt->list_head, &srbExt->vbr.list_entry);
         adaptExt->pci_vq_info.vq->vq_ops->kick(adaptExt->pci_vq_info.vq);
         return TRUE;
     }
+    StorPortBusy(DeviceExtension, 10);
     return FALSE;
 }
+
+BOOLEAN
+RhelDoReadWrite(PVOID DeviceExtension,
+                PSCSI_REQUEST_BLOCK Srb)
+{
+    return StorPortSynchronizeAccess(DeviceExtension, SynchronizedAccessRoutine, (PVOID)Srb);
+}
 #else
 BOOLEAN
 RhelDoReadWrite(PVOID DeviceExtension,
@@ -83,6 +94,8 @@ RhelDoReadWrite(PVOID DeviceExtension,
                   &srbExt->vbr.sg[0],
                   srbExt->out, srbExt->in,
                   &srbExt->vbr) == 0) {
+//FIXME
+InsertTailList(&adaptExt->list_head, &srbExt->vbr.list_entry);
 adaptExt->pci_vq_info.vq->vq_ops->kick(adaptExt->pci_vq_info.vq);
 return TRUE;
 }
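
The patch also drops the driver's hand-rolled struct list_head in favor of
the WDK-native LIST_ENTRY. A minimal user-mode sketch of that idiom follows
(the macros are re-declared here only so the sketch compiles standalone; in
the driver they come from the WDK headers):

#include <stddef.h>
#include <stdio.h>

typedef struct _LIST_ENTRY {
    struct _LIST_ENTRY *Flink, *Blink;
} LIST_ENTRY, *PLIST_ENTRY;

#define CONTAINING_RECORD(addr, type, field) \
    ((type *)((char *)(addr) - offsetof(type, field)))

static void InitializeListHead(PLIST_ENTRY h) { h->Flink = h->Blink = h; }

static void InsertTailList(PLIST_ENTRY h, PLIST_ENTRY e)
{
    e->Flink = h;
    e->Blink = h->Blink;
    h->Blink->Flink = e;
    h->Blink = e;
}

static void RemoveEntryList(PLIST_ENTRY e)
{
    e->Blink->Flink = e->Flink;
    e->Flink->Blink = e->Blink;
}

typedef struct virtio_blk_req {
    LIST_ENTRY list_entry;      /* links the request while it is in flight */
    int id;
} blk_req;

int main(void)
{
    LIST_ENTRY head;            /* plays the role of adaptExt->list_head */
    blk_req r = { .id = 42 };

    InitializeListHead(&head);
    InsertTailList(&head, &r.list_entry);                 /* submit path */
    blk_req *done = CONTAINING_RECORD(head.Flink, blk_req, list_entry);
    RemoveEntryList(&done->list_entry);                   /* completion path */
    printf("completed request %d\n", done->id);
    return 0;
}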





Re: [PATCH 2/5] Don't call svm_complete_interrupts for nested guests

2009-09-23 Thread Alexander Graf


On 22.09.2009 at 18:26, Joerg Roedel <j...@8bytes.org> wrote:


On Fri, Sep 18, 2009 at 03:00:29PM +0200, Alexander Graf wrote:
SVM has some cleanup code, that tries to reinject interrupts and
exceptions when the guest didn't manage to deal with them yet. It
basically transfers them to KVM internal state.

Unfortunately, the internal state is reserved for the L1 guest state,
so we shouldn't try to go through that logic when running a nested
guest.

When doing something the host KVM can handle, let's just reinject the
event into the L2 guest, because we didn't touch its state anyways.


I don't really understand what problem this patch addresses. There are
situations where we have events to reinject into the l2 guest directly.
But the generic reinjection code works fine for it. The only
problematic thing with it is that it implicitly relies on exit_int_info
not being changed in the exit cycle (which would be worth a comment).


It simply tries to be too clever. Reevaluating exceptions won't work,
for example.


Alex




   Joerg



Signed-off-by: Alexander Graf <ag...@suse.de>
---
arch/x86/kvm/svm.c |   18 ++++++++++++++----
1 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f12a669..61efd13 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2349,7 +2349,10 @@ static int handle_exit(struct kvm_vcpu *vcpu)
   trace_kvm_exit(exit_code, svm->vmcb->save.rip);

   if (is_nested(svm)) {
+struct vmcb_control_area *control = &svm->vmcb->control;
   int vmexit;
+int type;
+int vec;

   nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n",
               exit_code, svm->vmcb->control.exit_info_1,
@@ -2362,9 +2365,18 @@ static int handle_exit(struct kvm_vcpu *vcpu)

   if (vmexit == NESTED_EXIT_DONE)
   return 1;
-}

-svm_complete_interrupts(svm);
+type = control->exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
+vec = control->exit_int_info & SVM_EXITINTINFO_VEC_MASK;
+if ((type == SVM_EXITINTINFO_TYPE_INTR) ||
+    ((type == SVM_EXITINTINFO_TYPE_EXEPT) && !kvm_exception_is_soft(vec))) {
+control->event_inj = control->exit_int_info;
+control->event_inj_err = control->exit_int_info_err;
+}
+} else {
+/* Don't interpret exit_info for nested guests */
+svm_complete_interrupts(svm);
+}

   if (npt_enabled) {
   int mmu_reload = 0;
@@ -2602,8 +2614,6 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)

   case SVM_EXITINTINFO_TYPE_EXEPT:
   /* In case of software exception do not reinject an exception
      vector, but re-execute the instruction instead */
-if (is_nested(svm))
-break;
   if (kvm_exception_is_soft(vector))
   break;
   if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
--
1.6.0.2



Re: [PATCH 3/5] Don't #VMEXIT(INTR) if we still have event_inj waiting

2009-09-23 Thread Alexander Graf


On 22.09.2009 at 18:39, Joerg Roedel <j...@8bytes.org> wrote:


On Fri, Sep 18, 2009 at 03:00:30PM +0200, Alexander Graf wrote:
Real hardware would first process the event_inj field and then notify
the host that an interrupt is waiting.


Does it really? I couldn't find this in the SVM spec.


I didn't see it in the spec either, but that's what I saw real hardware
do, which is supported by the exit_info validity check in svm.c.




Let's do the same and just not EXIT_INTR if we have an event pending
for the L2 guest.


Anyway. I think this case is handled well enough with patch 5/5. This
patch, to be complete, must also enable single-stepping to exit again
after the first instruction of the exception handler has run, to inject
the interrupt. But that would make the whole thing rather complicated.


The NMI injection path has logic for that, but for now it shouldn't
hurt - we'll get an interrupt sooner or later.





Signed-off-by: Alexander Graf <ag...@suse.de>
---
arch/x86/kvm/svm.c |4 ++++
1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 61efd13..28fcbd0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1401,6 +1401,10 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)

   if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
   return 0;

+/* We can't EXIT_INTR when we still have an event to inject */
+if (svm->vmcb->control.event_inj)
+return 1;
+
   svm->vmcb->control.exit_code = SVM_EXIT_INTR;

   if (nested_svm_exit_handled(svm)) {
--
1.6.0.2



Re: [PATCH 2/5] Don't call svm_complete_interrupts for nested guests

2009-09-23 Thread Gleb Natapov
On Wed, Sep 23, 2009 at 01:05:57AM -0700, Alexander Graf wrote:
 
 Am 22.09.2009 um 18:26 schrieb Joerg Roedel j...@8bytes.org:
 
 On Fri, Sep 18, 2009 at 03:00:29PM +0200, Alexander Graf wrote:
 SVM has some cleanup code, that tries to reinject interrupts and
 exceptions
 when the guest didn't manage to deal with them yet. It basically
 transfers
 them to KVM internal state.
 
 Unfortunately, the internal state is reserved for the L1 guest
 state, so we
 shouldn't try to go through that logic when running a nested guest.
 
 When doing something the host KVM can handle, let's just
 reinject the event
 into the L2 guest, because we didn't touch its state anyways.
 
  I don't really understand what problem this patch addresses.
 There are
 situations where we have events to reinject into the l2 guest
 directly.
 But the generic reinjection code works fine for it.
 The only problematic thing with it is that it implicitly relies on
 exit_int_info not to be changed in the exit cycle (which would be
 worth
 a comment).
 
 It simply tries to be too clever. Reevaluating exceptions won't work
 for example.
 
Can you elaborate? What do you mean by too clever and why reevaluating
exceptions won't work?

--
Gleb.


Re: [PATCH] Update halted state from mpstate only in case of inkernel irq chip

2009-09-23 Thread Avi Kivity

On 09/22/2009 05:51 PM, Gleb Natapov wrote:

Otherwise cpu is always unhalted by call to kvm_arch_get_registers()
   


Applied, thanks.

--
error compiling committee.c: too many arguments to function



Re: [PATCH] Don't call kvm_cpu_synchronize_state() if there is no irqchip events to process.

2009-09-23 Thread Avi Kivity

On 09/22/2009 05:51 PM, Gleb Natapov wrote:

Signed-off-by: Gleb Natapov <g...@redhat.com>
   


Applied, thanks.

--
error compiling committee.c: too many arguments to function



Re: [PATCH] Don't call cpu_synchronize_state() in apic_init_reset()

2009-09-23 Thread Avi Kivity

On 09/22/2009 05:51 PM, Gleb Natapov wrote:

Each caller of the function already calls it.

Signed-off-by: Gleb Natapov <g...@redhat.com>
---
  hw/apic.c |1 -
  1 files changed, 0 insertions(+), 1 deletions(-)

diff --git a/hw/apic.c b/hw/apic.c
index 3a2e128..a9d1fb8 100644
--- a/hw/apic.c
+++ b/hw/apic.c
@@ -488,7 +488,6 @@ void apic_init_reset(CPUState *env)
  if (!s)
  return;

-cpu_synchronize_state(env);
   s->tpr = 0;
   s->spurious_vec = 0xff;
   s->log_dest = 0;
   


Still, it's safer to leave this in.

--
error compiling committee.c: too many arguments to function



Re: [PATCH] qemu-kvm: fix no gsi routing warning

2009-09-23 Thread Avi Kivity

On 09/21/2009 10:50 PM, Michael S. Tsirkin wrote:

On Mon, Sep 21, 2009 at 05:56:13PM +0300, Avi Kivity wrote:
   

On 09/21/2009 05:51 PM, Michael S. Tsirkin wrote:
 

When running on a host kernel which does not let the guest manipulate the
gsi routing, and user requested MSI-X to be enabled, we get the
following warnings:
kvm_msix_add: kvm_get_irq_route_gsi failed: No space left on device
kvm_msix_update: kvm_update_routing_entry failed: Invalid argument

What really happens is that we report a failure to allocate
a vector to the guest, it will retry and finally disable MSI.

Make this clearer by checking for gsi capability and warning about
the error in a readable form.


   

Can we disable msix
 

What we do effectively disables msix.

   

(or, abort qemu at startup and request that the user disable msix) if
the kernel doesn't provide required features? It's better than a
runtime error.
 

Note it's a warning, not an error.

We have a similar failure mode when we run out of gsi entries, and that
can not be checked upfront.  I prefer having it fail in the same way, so
that this failure path in guest drivers is exercised and tested.
   


I see, applied the patch, thanks.

--
error compiling committee.c: too many arguments to function



Re: Possible problem with -no-kvm-irqchip -no-kvm-pit qemu options detected during git daily testing

2009-09-23 Thread Avi Kivity

On 09/21/2009 10:24 PM, Lucas Meneghel Rodrigues wrote:

Ok, tests ended, indeed a problem showed up - we still have the
problem I reported with the options -no-kvm-irqchip -no-kvm-pit *when*
they are coupled with the option -smp 2.

If -smp 2 is not present, qemu works fine with the options
-no-kvm-irqchip -no-kvm-pit.

An example of command line:

/usr/local/autotest/tests/kvm/qemu -name 'vm1' -monitor
unix:/tmp/monitor-20090918-131514-TdDQ,server,nowait -drive
file=/usr/local/autotest/tests/kvm/images/fc9-64.qcow2,if=ide,boot=on
-net nic,vlan=0 -net user,vlan=0 -m 512  -smp 2 -no-kvm-irqchip
-no-kvm-pit -redir tcp:5000::22 -vnc :0

It's necessary to look further what could be causing this problem.
   


I've applied yet another patch by Gleb that should fix this.

--
error compiling committee.c: too many arguments to function



buildbot failure in qemu-kvm on disable_kvm_x86_64_out_of_tree

2009-09-23 Thread qemu-kvm
The Buildbot has detected a new failure of disable_kvm_x86_64_out_of_tree on 
qemu-kvm.
Full details are available at:
 
http://buildbot.b1-systems.de/qemu-kvm/builders/disable_kvm_x86_64_out_of_tree/builds/19

Buildbot URL: http://buildbot.b1-systems.de/qemu-kvm/

Buildslave for this Build: b1_qemu_kvm_1

Build Reason: 
Build Source Stamp: [branch next] HEAD
Blamelist: Avi Kivity <a...@redhat.com>, Gleb Natapov <g...@redhat.com>, Michael S. Tsirkin <m...@redhat.com>

BUILD FAILED: failed git

sincerely,
 -The Buildbot



buildbot failure in qemu-kvm on disable_kvm_x86_64_debian_5_0

2009-09-23 Thread qemu-kvm
The Buildbot has detected a new failure of disable_kvm_x86_64_debian_5_0 on 
qemu-kvm.
Full details are available at:
 
http://buildbot.b1-systems.de/qemu-kvm/builders/disable_kvm_x86_64_debian_5_0/builds/70

Buildbot URL: http://buildbot.b1-systems.de/qemu-kvm/

Buildslave for this Build: b1_qemu_kvm_1

Build Reason: 
Build Source Stamp: [branch next] HEAD
Blamelist: Avi Kivity <a...@redhat.com>, Gleb Natapov <g...@redhat.com>, Michael S. Tsirkin <m...@redhat.com>

BUILD FAILED: failed git

sincerely,
 -The Buildbot



buildbot failure in qemu-kvm on disable_kvm_i386_debian_5_0

2009-09-23 Thread qemu-kvm
The Buildbot has detected a new failure of disable_kvm_i386_debian_5_0 on 
qemu-kvm.
Full details are available at:
 
http://buildbot.b1-systems.de/qemu-kvm/builders/disable_kvm_i386_debian_5_0/builds/71

Buildbot URL: http://buildbot.b1-systems.de/qemu-kvm/

Buildslave for this Build: b1_qemu_kvm_2

Build Reason: 
Build Source Stamp: [branch next] HEAD
Blamelist: Avi Kivity <a...@redhat.com>, Gleb Natapov <g...@redhat.com>, Michael S. Tsirkin <m...@redhat.com>

BUILD FAILED: failed git

sincerely,
 -The Buildbot



RE: Virtio net driver for Windows 2008 R2

2009-09-23 Thread Yan Vugenfirer
Now :)
You can compile the drivers from the
git://git.kernel.org/pub/scm/virt/kvm/kvm-guest-drivers-windows.git
Use Vista drivers for Win2008R2 if you use the default build scripts. You
can also use the latest WDK and compile the code for Windows 2008 R2.

Please keep in mind that you need to test sign the drivers for 64bit MS
OSes. Kernel.org policies prevent me from keeping a test certificate in
the repository and signing the drivers automatically during the build
process.
http://msdn.microsoft.com/en-us/library/aa906344.aspx

Best regards,
Yan.


 -Original Message-
 From: kvm-ow...@vger.kernel.org [mailto:kvm-ow...@vger.kernel.org] On
 Behalf Of carlopmart
 Sent: Monday, September 21, 2009 11:35 PM
 To: kvm@vger.kernel.org
 Subject: Virtio net driver for Windows 2008 R2
 
 Hi all.
 
   When will it be possible to test virtio net drivers for Windows 2008 R2
  kvm guests?
 
 Thanks.
 
 --
 CL Martinez
 carlopmart {at} gmail {d0t} com


[PATCH] Test 802.1Q vlan of nic

2009-09-23 Thread Amos Kong
Test 802.1Q vlan of nic, configuring it with the vconfig command.
1) Create two VMs
2) Setup guests in different vlan by vconfig and test communication by ping
   using hard-coded ip address
3) Setup guests in same vlan and test communication by ping
4) Recover the vlan config

Signed-off-by: Amos Kong <ak...@redhat.com>
---
 client/tests/kvm/kvm_tests.cfg.sample |6 +++
 client/tests/kvm/tests/vlan_tag.py|   66 +
 2 files changed, 72 insertions(+), 0 deletions(-)
 create mode 100644 client/tests/kvm/tests/vlan_tag.py

diff --git a/client/tests/kvm/kvm_tests.cfg.sample b/client/tests/kvm/kvm_tests.cfg.sample
index 285a38f..5a3f97d 100644
--- a/client/tests/kvm/kvm_tests.cfg.sample
+++ b/client/tests/kvm/kvm_tests.cfg.sample
@@ -145,6 +145,12 @@ variants:
 kill_vm = yes
 kill_vm_gracefully = no
 
+- vlan_tag:  install setup
+type = vlan_tag
+subnet2 = 192.168.123
+vlans = 10 20
+nic_mode = tap
+nic_model = e1000
 
 # NICs
 variants:
diff --git a/client/tests/kvm/tests/vlan_tag.py b/client/tests/kvm/tests/vlan_tag.py
new file mode 100644
index 000..2904276
--- /dev/null
+++ b/client/tests/kvm/tests/vlan_tag.py
@@ -0,0 +1,66 @@
+import logging, time
+from autotest_lib.client.common_lib import error
+import kvm_subprocess, kvm_test_utils, kvm_utils
+
+def run_vlan_tag(test, params, env):
+    """
+    Test 802.1Q vlan of nic, config it by vconfig command.
+
+    1) Create two VMs
+    2) Setup guests in different vlan by vconfig and test communication by
+       ping using hard-coded ip address
+    3) Setup guests in same vlan and test communication by ping
+    4) Recover the vlan config
+
+    @param test: Kvm test object
+    @param params: Dictionary with the test parameters.
+    @param env: Dictionary with test environment.
+    """
+
+    vm = []
+    session = []
+    subnet2 = params.get("subnet2")
+    vlans = params.get("vlans").split()
+
+    vm.append(kvm_test_utils.get_living_vm(env, "%s" % params.get("main_vm")))
+
+    params_vm2 = params.copy()
+    params_vm2['image_snapshot'] = "yes"
+    params_vm2['kill_vm_gracefully'] = "no"
+    params_vm2["address_index"] = int(params.get("address_index", 0)) + 1
+    vm.append(vm[0].clone("vm2", params_vm2))
+    kvm_utils.env_register_vm(env, "vm2", vm[1])
+    if not vm[1].create():
+        raise error.TestError, "VM 'vm[1]' create failed"
+
+    for i in range(2):
+        session.append(kvm_test_utils.wait_for_login(vm[i]))
+
+    try:
+        vconfig_cmd = "vconfig add eth0 %s;ifconfig eth0.%s %s.%s"
+        if session[0].get_command_status(vconfig_cmd % (vlans[0],
+                                                        vlans[0],
+                                                        subnet2,
+                                                        11)) != 0 or \
+           session[1].get_command_status(vconfig_cmd % (vlans[1],
+                                                        vlans[1],
+                                                        subnet2,
+                                                        12)) != 0:
+            raise error.TestError, "Fail to config VMs ip address"
+        if session[0].get_command_status("ping -c 2 %s.12" % subnet2) == 0:
+            raise error.TestFail("Guest is unexpectedly pingable in different "
+                                 "vlan")
+
+        if session[1].get_command_status("vconfig rem eth0.%s;vconfig add eth0 "
+                                         "%s;ifconfig eth0.%s %s.12" %
+                                         (vlans[1],
+                                          vlans[0],
+                                          vlans[0],
+                                          subnet2)) != 0:
+            raise error.TestError, "Fail to config ip address of VM 'vm[1]'"
+        if session[0].get_command_status("ping -c 2 %s.12" % subnet2) != 0:
+            raise error.TestFail, "Fail to ping the guest in same vlan"
+    finally:
+        for i in range(2):
+            session[i].sendline("vconfig rem eth0.%s" % vlans[0])
+            session[i].close()
-- 
1.5.5.6



[PATCH] [RESEND] KVM:VMX: Add support for Pause-Loop Exiting

2009-09-23 Thread Zhai, Edwin

Avi,

This is the patch to enable PLE, which depends on a small change to the
Linux scheduler (see http://lkml.org/lkml/2009/5/20/447).

According to our discussion last time, one missing part is that on a PLE
exit we should pick an unscheduled vcpu at random and schedule it. But
further investigation found that:
1. It is hard for KVM to know the scheduling state of each vcpu.
2. The Linux scheduler has no existing API that can be used to pull a
specific task to this cpu, so we would need more changes to the common
scheduler.
So I prefer the current simple way: just give up the current cpu time.

If there is no objection, I'll try to push the common scheduler change
to Linux first.

Thanks,
edwin



kvm_ple_v2.patch
Description: Binary data
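
For illustration only, a minimal sketch of the "just give up the current
cpu time" option described above, written as a hypothetical VMX exit
handler (handle_pause() and its wiring are assumptions, not taken from the
posted patch):

#include <linux/sched.h>

/*
 * Hypothetical PAUSE-loop exit handler: the guest spun past the PLE
 * window, so the lock holder is probably descheduled.  Step past the
 * PAUSE instruction and yield this vcpu thread's time slice instead
 * of letting the guest keep spinning.
 */
static int handle_pause(struct kvm_vcpu *vcpu)
{
	skip_emulated_instruction(vcpu);  /* advance rip past PAUSE */
	yield();                          /* give the lock holder a chance */
	return 1;                         /* keep handling exits in-kernel */
}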


Re: [PATCH] [RESEND] KVM:VMX: Add support for Pause-Loop Exiting

2009-09-23 Thread Avi Kivity

On 09/23/2009 05:04 PM, Zhai, Edwin wrote:

Avi,

This is the patch to enable PLE, which depends on a small change to the
Linux scheduler (see http://lkml.org/lkml/2009/5/20/447).

According to our discussion last time, one missing part is that on a PLE
exit we should pick an unscheduled vcpu at random and schedule it. But
further investigation found that:
1. It is hard for KVM to know the scheduling state of each vcpu.
2. The Linux scheduler has no existing API that can be used to pull a
specific task to this cpu, so we would need more changes to the common
scheduler.
So I prefer the current simple way: just give up the current cpu time.

If there is no objection, I'll try to push the common scheduler change
to Linux first.


We haven't sorted out what is the correct thing to do here.  I think we 
should go for a directed yield, but until we have it, you can use 
hrtimers to sleep for 100 microseconds and hope the holding vcpu will 
get scheduled.  Even if it doesn't, we're only wasting a few percent cpu 
time instead of spinning.
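
A sketch of that interim approach, assuming a small helper called from the
PLE exit path (schedule_hrtimeout() is the stock kernel API; everything
around it is an assumption):

#include <linux/hrtimer.h>
#include <linux/sched.h>

/* Sleep ~100 microseconds and hope the lock-holding vcpu gets scheduled
 * in the meantime; worst case we waste a few percent cpu, not a spin. */
static void ple_backoff(void)
{
	ktime_t expires = ktime_set(0, 100 * 1000);	/* 100 us in ns */

	set_current_state(TASK_INTERRUPTIBLE);
	schedule_hrtimeout(&expires, HRTIMER_MODE_REL);
}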


--
error compiling committee.c: too many arguments to function



Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server

2009-09-23 Thread Gregory Haskins
Avi Kivity wrote:
 On 09/22/2009 12:43 AM, Ira W. Snyder wrote:

 Sure, virtio-ira and he is on his own to make a bus-model under that, or
 virtio-vbus + vbus-ira-connector to use the vbus framework.  Either
 model can work, I agree.

  
 Yes, I'm having to create my own bus model, a-la lguest, virtio-pci, and
 virtio-s390. It isn't especially easy. I can steal lots of code from the
 lguest bus model, but sometimes it is good to generalize, especially
 after the fourth implemention or so. I think this is what GHaskins tried
 to do.

 
 Yes.  vbus is more finely layered so there is less code duplication.

To clarify, Ira was correct in stating that generalizing some of these
components was one of the goals for the vbus project: IOW vbus finely
layers and defines what's below virtio, not replaces it.

You can think of a virtio-stack like this:

--
| virtio-net
--
| virtio-ring
--
| virtio-bus
--
| ? undefined ?
--

IOW: The way I see it, virtio is a device interface model only.  The
rest of it is filled in by the virtio-transport and some kind of back-end.

So today, we can complete the ? undefined ? block like this for KVM:

--
| virtio-pci
--
 |
--
| kvm.ko
--
| qemu
--
| tuntap
--

In this case, kvm.ko and tuntap are providing plumbing, and qemu is
providing a backend device model (pci-based, etc).

You can, of course, plug a different stack in (such as virtio-lguest,
virtio-ira, etc) but you are more or less on your own to recreate many
of the various facilities contained in that stack (such as things
provided by QEMU, like discovery/hotswap/addressing), as Ira is discovering.

Vbus tries to commoditize more components in the stack (like the bus
model and backend-device model) so they don't need to be redesigned each
time we solve this virtio-transport problem.  IOW: stop the
proliferation of the need for pci-bus, lguest-bus, foo-bus underneath
virtio.  Instead, we can then focus on the value add on top, like the
models themselves or the simple glue between them.

So now you might have something like

--
| virtio-vbus
--
| vbus-proxy
--
| kvm-guest-connector
--
 |
--
| kvm.ko
--
| kvm-host-connector.ko
--
| vbus.ko
--
| virtio-net-backend.ko
--

so now we don't need to worry about the bus-model or the device-model
framework.  We only need to implement the connector, etc.  This is handy
when you find yourself in an environment that doesn't support PCI (such
as Ira's rig, or userspace containers), or when you want to add features
that PCI doesn't have (such as fluid event channels for things like IPC
services, or prioritizable interrupts, etc).

 
 The virtio layering was more or less dictated by Xen which doesn't have
 shared memory (it uses grant references instead).  As a matter of fact
 lguest, kvm/pci, and kvm/s390 all have shared memory, as you do, so that
 part is duplicated.  It's probably possible to add a virtio-shmem.ko
 library that people who do have shared memory can reuse.

Note that I do not believe the Xen folk use virtio, so while I can
appreciate the foresight that went into that particular aspect of the
design of the virtio model, I am not sure if it's a realistic constraint.

The reason why I decided to not worry about that particular model is
twofold:

1) The cost of supporting non-shared-memory designs is prohibitively
high for my performance goals (for instance, requiring an exit on each
->add_buf() in addition to the ->kick()).

2) The Xen guys are unlikely to diverge from something like
xenbus/xennet anyway, so it would be for naught.

Therefore, I just went with a device model optimized for shared-memory
outright.
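
As a sketch of the exit-count argument in (1): with a shared-memory ring
the guest can post a whole batch of buffers without a single exit and then
pay for one kick, using the vq_ops-style virtqueue API of this period
(post_batch() itself is a hypothetical helper, not from any posted patch):

#include <linux/scatterlist.h>
#include <linux/virtio.h>

/* Post n buffers, then notify the host once: the add_buf() calls touch
 * only shared memory, so the whole batch costs a single guest exit. */
static void post_batch(struct virtqueue *vq, struct scatterlist *sg[],
		       void *cookie[], int n)
{
	int i;

	for (i = 0; i < n; i++)
		vq->vq_ops->add_buf(vq, sg[i], 1, 0, cookie[i]); /* no exit */
	vq->vq_ops->kick(vq);	/* one exit covers all n buffers */
}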

That said, I believe we can refactor what is called the
vbus-proxy-device into this virtio-shmem interface that you and
Anthony have described.  We could make the feature optional and only
support it on architectures where this makes sense.

<snip>

Kind Regards,
-Greg





Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server

2009-09-23 Thread Avi Kivity

On 09/23/2009 05:26 PM, Gregory Haskins wrote:


   

Yes, I'm having to create my own bus model, a-la lguest, virtio-pci, and
virtio-s390. It isn't especially easy. I can steal lots of code from the
lguest bus model, but sometimes it is good to generalize, especially
after the fourth implemention or so. I think this is what GHaskins tried
to do.

   

Yes.  vbus is more finely layered so there is less code duplication.
 

To clarify, Ira was correct in stating this generalizing some of these
components was one of the goals for the vbus project: IOW vbus finely
layers and defines what's below virtio, not replaces it.

You can think of a virtio-stack like this:

--
| virtio-net
--
| virtio-ring
--
| virtio-bus
--
| ? undefined ?
--

IOW: The way I see it, virtio is a device interface model only.  The
rest of it is filled in by the virtio-transport and some kind of back-end.

So today, we can complete the ? undefined ? block like this for KVM:

--
| virtio-pci
--
  |
--
| kvm.ko
--
| qemu
--
| tuntap
--

In this case, kvm.ko and tuntap are providing plumbing, and qemu is
providing a backend device model (pci-based, etc).

You can, of course, plug a different stack in (such as virtio-lguest,
virtio-ira, etc) but you are more or less on your own to recreate many
of the various facilities contained in that stack (such as things
provided by QEMU, like discovery/hotswap/addressing), as Ira is discovering.

Vbus tries to commoditize more components in the stack (like the bus
model and backend-device model) so they don't need to be redesigned each
time we solve this virtio-transport problem.  IOW: stop the
proliferation of the need for pci-bus, lguest-bus, foo-bus underneath
virtio.  Instead, we can then focus on the value add on top, like the
models themselves or the simple glue between them.

So now you might have something like

--
| virtio-vbus
--
| vbus-proxy
--
| kvm-guest-connector
--
  |
--
| kvm.ko
--
| kvm-host-connector.ko
--
| vbus.ko
--
| virtio-net-backend.ko
--

so now we don't need to worry about the bus-model or the device-model
framework.  We only need to implement the connector, etc.  This is handy
when you find yourself in an environment that doesn't support PCI (such
as Ira's rig, or userspace containers), or when you want to add features
that PCI doesn't have (such as fluid event channels for things like IPC
services, or prioritizable interrupts, etc).
   


Well, vbus does more, for example it tunnels interrupts instead of 
exposing them 1:1 on the native interface if it exists.  It also pulls 
parts of the device model into the host kernel.



The virtio layering was more or less dictated by Xen which doesn't have
shared memory (it uses grant references instead).  As a matter of fact
lguest, kvm/pci, and kvm/s390 all have shared memory, as you do, so that
part is duplicated.  It's probably possible to add a virtio-shmem.ko
library that people who do have shared memory can reuse.
 

Note that I do not believe the Xen folk use virtio, so while I can
appreciate the foresight that went into that particular aspect of the
design of the virtio model, I am not sure if its a realistic constraint.
   


Since a virtio goal was to reduce virtual device driver proliferation, 
it was necessary to accommodate Xen.


--
error compiling committee.c: too many arguments to function



Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server

2009-09-23 Thread Gregory Haskins
Avi Kivity wrote:
 On 09/23/2009 05:26 PM, Gregory Haskins wrote:

   
 Yes, I'm having to create my own bus model, a-la lguest, virtio-pci,
 and
 virtio-s390. It isn't especially easy. I can steal lots of code from
 the
 lguest bus model, but sometimes it is good to generalize, especially
 after the fourth implemention or so. I think this is what GHaskins
 tried
 to do.


 Yes.  vbus is more finely layered so there is less code duplication.
  
 To clarify, Ira was correct in stating this generalizing some of these
 components was one of the goals for the vbus project: IOW vbus finely
 layers and defines what's below virtio, not replaces it.

 You can think of a virtio-stack like this:

 --
 | virtio-net
 --
 | virtio-ring
 --
 | virtio-bus
 --
 | ? undefined ?
 --

 IOW: The way I see it, virtio is a device interface model only.  The
 rest of it is filled in by the virtio-transport and some kind of
 back-end.

 So today, we can complete the ? undefined ? block like this for KVM:

 --
 | virtio-pci
 --
   |
 --
 | kvm.ko
 --
 | qemu
 --
 | tuntap
 --

 In this case, kvm.ko and tuntap are providing plumbing, and qemu is
 providing a backend device model (pci-based, etc).

 You can, of course, plug a different stack in (such as virtio-lguest,
 virtio-ira, etc) but you are more or less on your own to recreate many
 of the various facilities contained in that stack (such as things
 provided by QEMU, like discovery/hotswap/addressing), as Ira is
 discovering.

 Vbus tries to commoditize more components in the stack (like the bus
 model and backend-device model) so they don't need to be redesigned each
 time we solve this virtio-transport problem.  IOW: stop the
 proliferation of the need for pci-bus, lguest-bus, foo-bus underneath
 virtio.  Instead, we can then focus on the value add on top, like the
 models themselves or the simple glue between them.

 So now you might have something like

 --
 | virtio-vbus
 --
 | vbus-proxy
 --
 | kvm-guest-connector
 --
   |
 --
 | kvm.ko
 --
 | kvm-host-connector.ko
 --
 | vbus.ko
 --
 | virtio-net-backend.ko
 --

 so now we don't need to worry about the bus-model or the device-model
 framework.  We only need to implement the connector, etc.  This is handy
 when you find yourself in an environment that doesn't support PCI (such
 as Ira's rig, or userspace containers), or when you want to add features
 that PCI doesn't have (such as fluid event channels for things like IPC
 services, or priortizable interrupts, etc).

 
 Well, vbus does more, for example it tunnels interrupts instead of
 exposing them 1:1 on the native interface if it exists.

As I've previously explained, that trait is a function of the
kvm-connector I've chosen to implement, not of the overall design of vbus.

The reason why my kvm-connector is designed that way is that my early
testing/benchmarking showed one of the issues in KVM performance to be
the ratio of exits per IO operation, which is fairly high, especially
as you scale io-load.  Therefore, the connector achieves a substantial
reduction in that ratio by giving interrupts the same kind of benefits
that NAPI brought to general networking: that is, we enqueue interrupt
messages into a lockless ring and only hit the IDT for the first
occurrence.  Subsequent interrupts are injected in a parallel/lockless
manner, without hitting the IDT nor incurring an extra EOI.  This pays
dividends as the IO rate increases, which is when the guest needs the
most help.
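
A sketch of that coalescing scheme, with deliberately hypothetical names
(this is not vbus source; raise_guest_irq() and handle_event() stand in
for the real injection and consumption paths):

#include <linux/types.h>

void raise_guest_irq(void);     /* assumed: injects one real interrupt */
void handle_event(u64 msg);     /* assumed: consumes one message */

/* Single-producer/single-consumer event ring shared with the guest. */
struct event_ring {
	u32 head;		/* producer index */
	u32 tail;		/* consumer index */
	u32 pending;		/* nonzero while the guest is draining */
	u64 msg[256];
};

/* Producer: only the first message of a burst raises a real interrupt,
 * so the IDT/EOI cost is paid once per burst, not once per message. */
static void inject_event(struct event_ring *r, u64 m)
{
	r->msg[r->head & 255] = m;
	smp_wmb();			/* publish payload before index */
	r->head++;
	if (!xchg(&r->pending, 1))
		raise_guest_irq();
}

/* Guest-side drain, called from the interrupt handler. */
static void drain_events(struct event_ring *r)
{
	for (;;) {
		while (r->tail != r->head)
			handle_event(r->msg[r->tail++ & 255]);
		xchg(&r->pending, 0);	/* allow a fresh interrupt */
		if (r->tail == r->head)
			break;		/* no producer raced us */
		xchg(&r->pending, 1);	/* raced: keep draining, no new irq */
	}
}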

OTOH, it is entirely possible to design the connector such that we
maintain a 1:1 ratio of signals to traditional IDT interrupts.  It is
also possible to design a connector which surfaces as something else,
such as PCI devices (by terminating the connector in QEMU and utilizing
its PCI emulation facilities), which would naturally employ 1:1 mapping.

So if 1:1 mapping is a critical feature (I would argue to the contrary),
vbus can support it.

 It also pulls parts of the device model into the host kernel.

That is the point.  Most of it needs to be there for performance.  And
what doesn't need to be there for performance can either be:

a) skipped at the discretion of the connector/device-model designer

OR

b) included because its trivially small subset of the model (e.g. a
mac-addr attribute) and its nice to have a cohesive solution instead of
requiring a separate binary blob that can get out of sync, etc.

The example I've provided to date (venet on kvm) utilizes (b), but it
certainly 

Re: [PATCH] Don't call cpu_synchronize_state() in apic_init_reset()

2009-09-23 Thread Jan Kiszka
Avi Kivity wrote:
 On 09/22/2009 05:51 PM, Gleb Natapov wrote:
 Each caller of the function already calls it.

 Signed-off-by: Gleb Natapov <g...@redhat.com>
 ---
   hw/apic.c |1 -
   1 files changed, 0 insertions(+), 1 deletions(-)

 diff --git a/hw/apic.c b/hw/apic.c
 index 3a2e128..a9d1fb8 100644
 --- a/hw/apic.c
 +++ b/hw/apic.c
 @@ -488,7 +488,6 @@ void apic_init_reset(CPUState *env)
   if (!s)
   return;

 -cpu_synchronize_state(env);
   s->tpr = 0;
   s->spurious_vec = 0xff;
   s->log_dest = 0;

 
 Still, it's safer to leave this in.
 

Yet another diff to upstream...

It's really time to stabilize this still a bit fuzzy register sync
model, also in qemu-kvm. If you think we need it, let us push it
upstream - with a sound explanation, and maybe even with more sync
points for the sake of consistency.

Jan





Re: [PATCH] Don't call cpu_synchronize_state() in apic_init_reset()

2009-09-23 Thread Avi Kivity

On 09/23/2009 06:07 PM, Jan Kiszka wrote:

Avi Kivity wrote:
   

On 09/22/2009 05:51 PM, Gleb Natapov wrote:
 

Each caller of the function already calls it.

Signed-off-by: Gleb Natapov <g...@redhat.com>
---
   hw/apic.c |1 -
   1 files changed, 0 insertions(+), 1 deletions(-)

diff --git a/hw/apic.c b/hw/apic.c
index 3a2e128..a9d1fb8 100644
--- a/hw/apic.c
+++ b/hw/apic.c
@@ -488,7 +488,6 @@ void apic_init_reset(CPUState *env)
   if (!s)
   return;

-cpu_synchronize_state(env);
   s->tpr = 0;
   s->spurious_vec = 0xff;
   s->log_dest = 0;

   

Still, it's safer to leave this in.

 

Yet another diff to upstream...

It's really time to stabilize this still a bit fuzzy register sync
model, also in qemu-kvm. If you think we need it, let us push it
upstream - with a sound explanation, and maybe even with more sync
points for the sake of consistency.

   


Functions calling each other in the same subsystem can rely on callers 
calling cpu_synchronize_state().  Across subsystems, that's another 
matter, exported functions should try not to rely on implementation 
details of their callers.


(You might argue that the apic is not a separate subsystem wrt an x86
cpu, and I'm not sure I have a counterargument)


--
error compiling committee.c: too many arguments to function



Re: [PATCH] Don't call cpu_synchronize_state() in apic_init_reset()

2009-09-23 Thread Jan Kiszka
Avi Kivity wrote:
 On 09/23/2009 06:07 PM, Jan Kiszka wrote:
 Avi Kivity wrote:
   
 On 09/22/2009 05:51 PM, Gleb Natapov wrote:
 
 Each caller of the function already calls it.

 Signed-off-by: Gleb Natapov <g...@redhat.com>
 ---
hw/apic.c |1 -
1 files changed, 0 insertions(+), 1 deletions(-)

 diff --git a/hw/apic.c b/hw/apic.c
 index 3a2e128..a9d1fb8 100644
 --- a/hw/apic.c
 +++ b/hw/apic.c
 @@ -488,7 +488,6 @@ void apic_init_reset(CPUState *env)
if (!s)
return;

 -cpu_synchronize_state(env);
   s->tpr = 0;
   s->spurious_vec = 0xff;
   s->log_dest = 0;


 Still, it's safer to leave this in.

  
 Yet another diff to upstream...

 It's really time to stabilize this still a bit fuzzy register sync
 model, also in qemu-kvm. If you think we need it, let us push it
 upstream - with a sound explanation, and maybe even with more sync
 points for the sake of consistency.


 
 Functions calling each other in the same subsystem can rely on callers
 calling cpu_synchronize_state().  Across subsystems, that's another
 matter, exported functions should try not to rely on implementation
 details of their callers.
 
 (You might argue that the apic is not separate subsystem wrt an x86 cpu,
 and I'm not sure I have a counterargument)
 

I do accept this argument. It's just that my feeling is that we are
lacking a proper review of the required call sites of
cpu_synchronize_state and rather put it where some regression popped up
(and that only in qemu-kvm...).

The new rule is: Synchronize the states before accessing registers (or
in-kernel devices) the first time after a vmexit to user space. But,
e.g., I do not see where we do this on CPU reset.
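
A minimal sketch of that rule (device_reset() here is hypothetical;
cpu_synchronize_state() is the real call):

/* Sync register state from the kernel once, before the first access
 * after a vmexit to user space; only then touch the user-space copy. */
static void device_reset(CPUState *env)
{
    cpu_synchronize_state(env);   /* refresh env from KVM first */

    env->eip = 0xfff0;            /* now safe to modify; written back
                                     before the vcpu runs again */
}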

Jan





sync guest calls made async on host - SQLite performance

2009-09-23 Thread Matthew Tippett

Hi,

I would like to call attention to the SQLite performance under KVM in 
the current Ubuntu Alpha.


http://www.phoronix.com/scan.php?page=article&item=linux_2631_kvm&num=3

SQLite's benchmark as part of the Phoronix Test Suite is typically IO 
limited and is affected by both disk and filesystem performance.


When comparing SQLite under the host against the guest OS,  there is an 
order of magnitude _IMPROVEMENT_ in the measured performance  of the guest.


I am expecting that the host is doing synchronous IO operations but 
somewhere in the stack the calls are ultimately being made asynchronous 
or at the very least batched for writing.


On the surface, this represents a data integrity issue and I am
interested in the KVM community's thoughts on this behaviour.  Is it
expected?  Is it acceptable?  Is it safe?
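
One way to probe this from inside the guest (a sketch, not from the
thread) is to time fsync'd writes; if each one completes far faster than a
disk flush plausibly can, completions are being acknowledged before the
data reaches stable storage:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
    char buf[512];
    struct timespec t0, t1;
    int i, fd = open("syncprobe.dat", O_WRONLY | O_CREAT, 0644);

    memset(buf, 0, sizeof(buf));
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (i = 0; i < 100; i++) {
        pwrite(fd, buf, sizeof(buf), 0);
        fsync(fd);               /* should reach stable storage each time */
    }
    clock_gettime(CLOCK_MONOTONIC, &t1);
    printf("avg fsync'd write: %.1f us\n",
           ((t1.tv_sec - t0.tv_sec) * 1e6 +
            (t1.tv_nsec - t0.tv_nsec) / 1e3) / 100);
    close(fd);
    return 0;
}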


Regards,

Matthew


Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server

2009-09-23 Thread Gregory Haskins
Gregory Haskins wrote:
 Avi Kivity wrote:
 On 09/23/2009 05:26 PM, Gregory Haskins wrote:
   
 Yes, I'm having to create my own bus model, a-la lguest, virtio-pci,
 and
 virtio-s390. It isn't especially easy. I can steal lots of code from
 the
 lguest bus model, but sometimes it is good to generalize, especially
 after the fourth implemention or so. I think this is what GHaskins
 tried
 to do.


 Yes.  vbus is more finely layered so there is less code duplication.
  
 To clarify, Ira was correct in stating this generalizing some of these
 components was one of the goals for the vbus project: IOW vbus finely
 layers and defines what's below virtio, not replaces it.

 You can think of a virtio-stack like this:

 --
 | virtio-net
 --
 | virtio-ring
 --
 | virtio-bus
 --
 | ? undefined ?
 --

 IOW: The way I see it, virtio is a device interface model only.  The
 rest of it is filled in by the virtio-transport and some kind of
 back-end.

 So today, we can complete the ? undefined ? block like this for KVM:

 --
 | virtio-pci
 --
   |
 --
 | kvm.ko
 --
 | qemu
 --
 | tuntap
 --

 In this case, kvm.ko and tuntap are providing plumbing, and qemu is
 providing a backend device model (pci-based, etc).

 You can, of course, plug a different stack in (such as virtio-lguest,
 virtio-ira, etc) but you are more or less on your own to recreate many
 of the various facilities contained in that stack (such as things
 provided by QEMU, like discovery/hotswap/addressing), as Ira is
 discovering.

 Vbus tries to commoditize more components in the stack (like the bus
 model and backend-device model) so they don't need to be redesigned each
 time we solve this virtio-transport problem.  IOW: stop the
 proliferation of the need for pci-bus, lguest-bus, foo-bus underneath
 virtio.  Instead, we can then focus on the value add on top, like the
 models themselves or the simple glue between them.

 So now you might have something like

 --
 | virtio-vbus
 --
 | vbus-proxy
 --
 | kvm-guest-connector
 --
   |
 --
 | kvm.ko
 --
 | kvm-host-connector.ko
 --
 | vbus.ko
 --
 | virtio-net-backend.ko
 --

 so now we don't need to worry about the bus-model or the device-model
 framework.  We only need to implement the connector, etc.  This is handy
 when you find yourself in an environment that doesn't support PCI (such
 as Ira's rig, or userspace containers), or when you want to add features
 that PCI doesn't have (such as fluid event channels for things like IPC
 services, or prioritizable interrupts, etc).

 Well, vbus does more, for example it tunnels interrupts instead of
 exposing them 1:1 on the native interface if it exists.
 
 As I've previously explained, that trait is a function of the
 kvm-connector I've chosen to implement, not of the overall design of vbus.
 
 The reason why my kvm-connector is designed that way is because my early
 testing/benchmarking shows one of the issues in KVM performance is the
 ratio of exits per IO operation are fairly high, especially as your
 scale io-load.  Therefore, the connector achieves a substantial
 reduction in that ratio by treating interrupts to the same kind of
 benefits that NAPI brought to general networking: That is, we enqueue
 interrupt messages into a lockless ring and only hit the IDT for the
 first occurrence.  Subsequent interrupts are injected in a
 parallel/lockless manner, without hitting the IDT nor incurring an extra
 EOI.  This pays dividends as the IO rate increases, which is when the
 guest needs the most help.
 
 OTOH, it is entirely possible to design the connector such that we
 maintain a 1:1 ratio of signals to traditional IDT interrupts.  It is
 also possible to design a connector which surfaces as something else,
 such as PCI devices (by terminating the connector in QEMU and utilizing
 its PCI emulation facilities), which would naturally employ 1:1 mapping.
 
 So if 1:1 mapping is a critical feature (I would argue to the contrary),
 vbus can support it.
 
 It also pulls parts of the device model into the host kernel.
 
 That is the point.  Most of it needs to be there for performance.

To clarify this point:

There are various aspects to designing high-performance virtual
devices, such as providing the shortest paths possible between the
physical resources and the consumers.  Conversely, we also need to
ensure that we meet proper isolation/protection guarantees at the same
time.  What this means is there are various aspects to any
high-performance PV 

[PATCH 0/3] ksm support for kvm v2

2009-09-23 Thread Izik Eidus
Hope I fixed everything I was asked...
Please tell me if I forgot anything.

Izik Eidus (3):
  kvm: dont hold pagecount reference for mapped sptes pages
  add SPTE_HOST_WRITEABLE flag to the shadow ptes
  add support for change_pte mmu notifiers

 arch/x86/include/asm/kvm_host.h |1 +
 arch/x86/kvm/mmu.c  |   86 ++
 arch/x86/kvm/paging_tmpl.h  |   18 +++-
 virt/kvm/kvm_main.c |   14 ++
 4 files changed, 98 insertions(+), 21 deletions(-)

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/3] kvm: dont hold pagecount reference for mapped sptes pages

2009-09-23 Thread Izik Eidus
When using mmu notifiers, we are allowed to remove the page count
reference taken by get_user_pages to a specific page that is mapped
inside the shadow page tables.

This is needed so we can balance the pagecount against mapcount
checking.

(Right now kvm increases the pagecount and does not increase the
mapcount when mapping a page into a shadow page table entry, so when
comparing pagecount against mapcount, you have no reliable result.)

Signed-off-by: Izik Eidus <iei...@redhat.com>
---
 arch/x86/kvm/mmu.c |7 ++-----
 1 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index eca41ae..6c67b23 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -634,9 +634,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	if (*spte & shadow_accessed_mask)
 		kvm_set_pfn_accessed(pfn);
 	if (is_writeble_pte(*spte))
-		kvm_release_pfn_dirty(pfn);
-	else
-		kvm_release_pfn_clean(pfn);
+		kvm_set_pfn_dirty(pfn);
 	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
 	if (!*rmapp) {
 		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
@@ -1877,8 +1875,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	page_header_update_slot(vcpu->kvm, sptep, gfn);
 	if (!was_rmapped) {
 		rmap_count = rmap_add(vcpu, sptep, gfn);
-		if (!is_rmap_spte(*sptep))
-			kvm_release_pfn_clean(pfn);
+		kvm_release_pfn_clean(pfn);
 		if (rmap_count > RMAP_RECYCLE_THRESHOLD)
 			rmap_recycle(vcpu, sptep, gfn);
 	} else {
-- 
1.5.6.5



[PATCH 2/3] add SPTE_HOST_WRITEABLE flag to the shadow ptes

2009-09-23 Thread Izik Eidus
This flag notifies that the host physical page we are pointing to from
the spte is write protected, and therefore we can't change its access
to writable unless we run get_user_pages(write = 1).

(This is needed for change_pte support in kvm.)

Signed-off-by: Izik Eidus <iei...@redhat.com>
---
 arch/x86/kvm/mmu.c |   15 +++
 arch/x86/kvm/paging_tmpl.h |   18 +++---
 2 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6c67b23..5cd8b4e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -156,6 +156,8 @@ module_param(oos_shadow, bool, 0644);
 #define CREATE_TRACE_POINTS
 #include "mmutrace.h"
 
+#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
 struct kvm_rmap_desc {
@@ -1754,7 +1756,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int user_fault,
int write_fault, int dirty, int level,
gfn_t gfn, pfn_t pfn, bool speculative,
-   bool can_unsync)
+   bool can_unsync, bool reset_host_protection)
 {
u64 spte;
int ret = 0;
@@ -1781,6 +1783,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
kvm_is_mmio_pfn(pfn));
 
+   if (reset_host_protection)
+   spte |= SPTE_HOST_WRITEABLE;
+
 	spte |= (u64)pfn << PAGE_SHIFT;
 
 	if ((pte_access & ACC_WRITE_MASK)
@@ -1826,7 +1831,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 unsigned pt_access, unsigned pte_access,
 int user_fault, int write_fault, int dirty,
 int *ptwrite, int level, gfn_t gfn,
-pfn_t pfn, bool speculative)
+pfn_t pfn, bool speculative,
+bool reset_host_protection)
 {
int was_rmapped = 0;
int was_writeble = is_writeble_pte(*sptep);
@@ -1858,7 +1864,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
}
 
if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
- dirty, level, gfn, pfn, speculative, true)) {
+ dirty, level, gfn, pfn, speculative, true,
+ reset_host_protection)) {
if (write_fault)
*ptwrite = 1;
 		kvm_x86_ops->tlb_flush(vcpu);
@@ -1906,7 +1913,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
if (iterator.level == level) {
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
 0, write, 1, pt_write,
-level, gfn, pfn, false);
+level, gfn, pfn, false, true);
++vcpu-stat.pf_fixed;
break;
}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index d2fec9c..cfd2424 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -273,9 +273,13 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 	if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
return;
kvm_get_pfn(pfn);
+   /*
+	 * we call mmu_set_spte() with reset_host_protection = true because
+	 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
+*/
 	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
 		     gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL,
-		     gpte_to_gfn(gpte), pfn, true);
+		     gpte_to_gfn(gpte), pfn, true, true);
 }
 
 /*
@@ -308,7 +312,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 user_fault, write_fault,
 			     gw->ptes[gw->level-1] & PT_DIRTY_MASK,
 			     ptwrite, level,
-			     gw->gfn, pfn, false);
+			     gw->gfn, pfn, false, true);
break;
}
 
@@ -558,6 +562,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
 static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
int i, offset, nr_present;
+   bool reset_host_protection;
 
offset = nr_present = 0;
 
@@ -595,9 +600,16 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 
nr_present++;
 		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+		if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) {
+			pte_access &= ~ACC_WRITE_MASK;
+			reset_host_protection = 0;
+		} else {
+ 

[PATCH 3/3] add support for change_pte mmu notifiers

2009-09-23 Thread Izik Eidus
This is needed for kvm if it wants ksm to directly map pages into its
shadow page tables.

Signed-off-by: Izik Eidus <iei...@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |1 +
 arch/x86/kvm/mmu.c  |   64 +-
 virt/kvm/kvm_main.c |   14 
 3 files changed, 70 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3be0004..d838922 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -796,6 +796,7 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
 #define KVM_ARCH_WANT_MMU_NOTIFIER
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5cd8b4e..0905ca2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -748,7 +748,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
return write_protected;
 }
 
-static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data)
 {
u64 *spte;
int need_tlb_flush = 0;
@@ -763,8 +763,47 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long 
*rmapp)
return need_tlb_flush;
 }
 
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- int (*handler)(struct kvm *kvm, unsigned long *rmapp))
+static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data)
+{
+   int need_flush = 0;
+   u64 *spte, new_spte;
+   pte_t *ptep = (pte_t *)data;
+   pfn_t new_pfn;
+
+   WARN_ON(pte_huge(*ptep));
+   new_pfn = pte_pfn(*ptep);
+   spte = rmap_next(kvm, rmapp, NULL);
+   while (spte) {
+   BUG_ON(!is_shadow_present_pte(*spte));
+   rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
+   need_flush = 1;
+   if (pte_write(*ptep)) {
+   rmap_remove(kvm, spte);
+   __set_spte(spte, shadow_trap_nonpresent_pte);
+   spte = rmap_next(kvm, rmapp, NULL);
+   } else {
+   new_spte = *spte & ~(PT64_BASE_ADDR_MASK);
+   new_spte |= new_pfn << PAGE_SHIFT;
+
+   if (!pte_write(*ptep)) {
+   new_spte &= ~PT_WRITABLE_MASK;
+   new_spte &= ~SPTE_HOST_WRITEABLE;
+   if (is_writeble_pte(*spte))
+   kvm_set_pfn_dirty(spte_to_pfn(*spte));
+   }
+   __set_spte(spte, new_spte);
+   spte = rmap_next(kvm, rmapp, spte);
+   }
+   }
+   if (need_flush)
+   kvm_flush_remote_tlbs(kvm);
+
+   return 0;
+}
+
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, u64 data,
+ int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+u64 data))
 {
int i, j;
int retval = 0;
@@ -786,13 +825,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long 
hva,
if (hva >= start && hva < end) {
gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
 
-   retval |= handler(kvm, &memslot->rmap[gfn_offset]);
+   retval |= handler(kvm, &memslot->rmap[gfn_offset],
+ data);
 
for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
int idx = gfn_offset;
idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
retval |= handler(kvm,
-   &memslot->lpage_info[j][idx].rmap_pde);
+   &memslot->lpage_info[j][idx].rmap_pde,
+   data);
}
}
}
@@ -802,10 +843,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long 
hva,
 
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
 {
-   return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+   return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
+}
+
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+   kvm_handle_hva(kvm, hva, (u64)&pte, kvm_set_pte_rmapp);
 }
 
-static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data)
 {
u64 *spte;
int young = 0;
@@ -841,13 +887,13 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 
*spte, gfn_t gfn)
   

Re: [PATCH 3/3] add support for change_pte mmu notifiers

2009-09-23 Thread Izik Eidus

Izik Eidus wrote:

this is needed for kvm if it wants ksm to directly map pages into its
shadow page tables.

Signed-off-by: Izik Eidus iei...@redhat.com
---
 arch/x86/include/asm/kvm_host.h |1 +
 arch/x86/kvm/mmu.c  |   64 +-
 virt/kvm/kvm_main.c |   14 
 3 files changed, 70 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3be0004..d838922 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -796,6 +796,7 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
 #define KVM_ARCH_WANT_MMU_NOTIFIER
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5cd8b4e..0905ca2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -748,7 +748,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
return write_protected;
 }
 
-static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)

+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data)
 {
u64 *spte;
int need_tlb_flush = 0;
@@ -763,8 +763,47 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long 
*rmapp)
return need_tlb_flush;
 }
 
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,

- int (*handler)(struct kvm *kvm, unsigned long *rmapp))
+static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data)
+{
+   int need_flush = 0;
+   u64 *spte, new_spte;
+   pte_t *ptep = (pte_t *)data;
+   pfn_t new_pfn;
+
+   WARN_ON(pte_huge(*ptep));
+   new_pfn = pte_pfn(*ptep);
+   spte = rmap_next(kvm, rmapp, NULL);
+   while (spte) {
+   BUG_ON(!is_shadow_present_pte(*spte));
+   rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
+   need_flush = 1;
+   if (pte_write(*ptep)) {
+   rmap_remove(kvm, spte);
+   __set_spte(spte, shadow_trap_nonpresent_pte);
+   spte = rmap_next(kvm, rmapp, NULL);
+   } else {
+   new_spte = *spte & ~(PT64_BASE_ADDR_MASK);
+   new_spte |= new_pfn << PAGE_SHIFT;
+
+   if (!pte_write(*ptep)) {
  


Just noticed that this if is not needed (we can only get here when the 
if (pte_write(*ptep)) a few lines before was false)...

I will resend

+   new_spte &= ~PT_WRITABLE_MASK;
+   new_spte &= ~SPTE_HOST_WRITEABLE;
+   if (is_writeble_pte(*spte))
+   kvm_set_pfn_dirty(spte_to_pfn(*spte));
+   }
+   __set_spte(spte, new_spte);
+   spte = rmap_next(kvm, rmapp, spte);
+   }
+   }
+   if (need_flush)
+   kvm_flush_remote_tlbs(kvm);
+


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/3] kvm ksm support v3

2009-09-23 Thread Izik Eidus
Change from v2: remove unused if.

Thanks.

Izik Eidus (3):
  kvm: don't hold pagecount reference for mapped sptes pages
  add SPTE_HOST_WRITEABLE flag to the shadow ptes
  add support for change_pte mmu notifiers

 arch/x86/include/asm/kvm_host.h |1 +
 arch/x86/kvm/mmu.c  |   84 ++
 arch/x86/kvm/paging_tmpl.h  |   18 +++-
 virt/kvm/kvm_main.c |   14 ++
 4 files changed, 96 insertions(+), 21 deletions(-)

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/3] kvm: don't hold pagecount reference for mapped sptes pages

2009-09-23 Thread Izik Eidus
When using mmu notifiers, we are allowed to remove the page count
reference taken by get_user_pages on a specific page that is mapped
inside the shadow page tables.

This is needed so we can balance the pagecount against mapcount
checking.

(Right now kvm increases the pagecount but does not increase the
mapcount when mapping a page into a shadow page table entry,
so when comparing pagecount against mapcount, you get no
reliable result.)
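
For illustration, the kind of balance test this unblocks -- a sketch in
the spirit of the check ksm makes in mm/ksm.c (the exact form there may
differ):

#include <linux/mm.h>

/*
 * A page is only a merge candidate when every reference to it is
 * explained by a mapping, the caller's own reference, and the swap
 * cache.  The extra reference kvm used to hold for pages mapped in
 * shadow page tables made this test fail forever.
 */
static int page_references_balanced(struct page *page)
{
	int swapped = PageSwapCache(page) ? 1 : 0;

	return page_mapcount(page) + 1 + swapped == page_count(page);
}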

Signed-off-by: Izik Eidus iei...@redhat.com
---
 arch/x86/kvm/mmu.c |7 ++-
 1 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index eca41ae..6c67b23 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -634,9 +634,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
if (*spte & shadow_accessed_mask)
kvm_set_pfn_accessed(pfn);
if (is_writeble_pte(*spte))
-   kvm_release_pfn_dirty(pfn);
-   else
-   kvm_release_pfn_clean(pfn);
+   kvm_set_pfn_dirty(pfn);
rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
if (!*rmapp) {
printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
@@ -1877,8 +1875,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 
*sptep,
page_header_update_slot(vcpu->kvm, sptep, gfn);
if (!was_rmapped) {
rmap_count = rmap_add(vcpu, sptep, gfn);
-   if (!is_rmap_spte(*sptep))
-   kvm_release_pfn_clean(pfn);
+   kvm_release_pfn_clean(pfn);
if (rmap_count > RMAP_RECYCLE_THRESHOLD)
rmap_recycle(vcpu, sptep, gfn);
} else {
-- 
1.5.6.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/3] add SPTE_HOST_WRITEABLE flag to the shadow ptes

2009-09-23 Thread Izik Eidus
this flag indicates that the host physical page we are pointing to from
the spte is write protected, and therefore we can't change its access
to writable unless we run get_user_pages(write = 1).

(this is needed for change_pte support in kvm)
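
As an aside, the flag lives in one of the software-available pte bits,
so the hardware walker never sees it.  A sketch of the layout, assuming
PT_FIRST_AVAIL_BITS_SHIFT is 9 as in mmu.c of this era:

/*
 *   63 ... 12 | 11 10 9 | 8 ... 0
 *      pfn    |  avail  | arch-defined flags
 *
 * x86 ptes ignore bits 9-11 (available to software), so
 * SPTE_HOST_WRITEABLE == 1ULL << PT_FIRST_AVAIL_BITS_SHIFT occupies
 * the first of them.
 */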

Signed-off-by: Izik Eidus iei...@redhat.com
---
 arch/x86/kvm/mmu.c |   15 +++
 arch/x86/kvm/paging_tmpl.h |   18 +++---
 2 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6c67b23..5cd8b4e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -156,6 +156,8 @@ module_param(oos_shadow, bool, 0644);
 #define CREATE_TRACE_POINTS
 #include mmutrace.h
 
+#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
 struct kvm_rmap_desc {
@@ -1754,7 +1756,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int user_fault,
int write_fault, int dirty, int level,
gfn_t gfn, pfn_t pfn, bool speculative,
-   bool can_unsync)
+   bool can_unsync, bool reset_host_protection)
 {
u64 spte;
int ret = 0;
@@ -1781,6 +1783,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
kvm_is_mmio_pfn(pfn));
 
+   if (reset_host_protection)
+   spte |= SPTE_HOST_WRITEABLE;
+
spte |= (u64)pfn << PAGE_SHIFT;
 
if ((pte_access & ACC_WRITE_MASK)
@@ -1826,7 +1831,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 
*sptep,
 unsigned pt_access, unsigned pte_access,
 int user_fault, int write_fault, int dirty,
 int *ptwrite, int level, gfn_t gfn,
-pfn_t pfn, bool speculative)
+pfn_t pfn, bool speculative,
+bool reset_host_protection)
 {
int was_rmapped = 0;
int was_writeble = is_writeble_pte(*sptep);
@@ -1858,7 +1864,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 
*sptep,
}
 
if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
- dirty, level, gfn, pfn, speculative, true)) {
+ dirty, level, gfn, pfn, speculative, true,
+ reset_host_protection)) {
if (write_fault)
*ptwrite = 1;
kvm_x86_ops->tlb_flush(vcpu);
@@ -1906,7 +1913,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, 
int write,
if (iterator.level == level) {
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
 0, write, 1, pt_write,
-level, gfn, pfn, false);
+level, gfn, pfn, false, true);
++vcpu->stat.pf_fixed;
break;
}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index d2fec9c..cfd2424 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -273,9 +273,13 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, 
struct kvm_mmu_page *page,
if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
return;
kvm_get_pfn(pfn);
+   /*
+* we call mmu_set_spte() with reset_host_protection = true because
+* vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
+*/
mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
 gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL,
-gpte_to_gfn(gpte), pfn, true);
+gpte_to_gfn(gpte), pfn, true, true);
 }
 
 /*
@@ -308,7 +312,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 user_fault, write_fault,
gw->ptes[gw->level-1] & PT_DIRTY_MASK,
 ptwrite, level,
-gw->gfn, pfn, false);
+gw->gfn, pfn, false, true);
break;
}
 
@@ -558,6 +562,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
 static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
int i, offset, nr_present;
+   bool reset_host_protection;
 
offset = nr_present = 0;
 
@@ -595,9 +600,16 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct 
kvm_mmu_page *sp)
 
nr_present++;
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+   if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) {
+   pte_access &= ~ACC_WRITE_MASK;
+   reset_host_protection = 0;
+   } else {
+ 

[PATCH 3/3] add support for change_pte mmu notifiers

2009-09-23 Thread Izik Eidus
this is needed for kvm if it wants ksm to directly map pages into its
shadow page tables.

Signed-off-by: Izik Eidus iei...@redhat.com
---
 arch/x86/include/asm/kvm_host.h |1 +
 arch/x86/kvm/mmu.c  |   62 +-
 virt/kvm/kvm_main.c |   14 +
 3 files changed, 68 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3be0004..d838922 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -796,6 +796,7 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
 #define KVM_ARCH_WANT_MMU_NOTIFIER
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5cd8b4e..ceec065 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -748,7 +748,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
return write_protected;
 }
 
-static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data)
 {
u64 *spte;
int need_tlb_flush = 0;
@@ -763,8 +763,45 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long 
*rmapp)
return need_tlb_flush;
 }
 
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- int (*handler)(struct kvm *kvm, unsigned long *rmapp))
+static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data)
+{
+   int need_flush = 0;
+   u64 *spte, new_spte;
+   pte_t *ptep = (pte_t *)data;
+   pfn_t new_pfn;
+
+   WARN_ON(pte_huge(*ptep));
+   new_pfn = pte_pfn(*ptep);
+   spte = rmap_next(kvm, rmapp, NULL);
+   while (spte) {
+   BUG_ON(!is_shadow_present_pte(*spte));
+   rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
+   need_flush = 1;
+   if (pte_write(*ptep)) {
+   rmap_remove(kvm, spte);
+   __set_spte(spte, shadow_trap_nonpresent_pte);
+   spte = rmap_next(kvm, rmapp, NULL);
+   } else {
+   new_spte = *spte & ~(PT64_BASE_ADDR_MASK);
+   new_spte |= new_pfn << PAGE_SHIFT;
+
+   new_spte &= ~PT_WRITABLE_MASK;
+   new_spte &= ~SPTE_HOST_WRITEABLE;
+   if (is_writeble_pte(*spte))
+   kvm_set_pfn_dirty(spte_to_pfn(*spte));
+   __set_spte(spte, new_spte);
+   spte = rmap_next(kvm, rmapp, spte);
+   }
+   }
+   if (need_flush)
+   kvm_flush_remote_tlbs(kvm);
+
+   return 0;
+}
+
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, u64 data,
+ int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+u64 data))
 {
int i, j;
int retval = 0;
@@ -786,13 +823,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long 
hva,
if (hva >= start && hva < end) {
gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
 
-   retval |= handler(kvm, &memslot->rmap[gfn_offset]);
+   retval |= handler(kvm, &memslot->rmap[gfn_offset],
+ data);
 
for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
int idx = gfn_offset;
idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
retval |= handler(kvm,
-   &memslot->lpage_info[j][idx].rmap_pde);
+   &memslot->lpage_info[j][idx].rmap_pde,
+   data);
}
}
}
@@ -802,10 +841,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long 
hva,
 
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
 {
-   return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+   return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
+}
+
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+   kvm_handle_hva(kvm, hva, (u64)&pte, kvm_set_pte_rmapp);
 }
 
-static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data)
 {
u64 *spte;
int young = 0;
@@ -841,13 +885,13 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 
*spte, gfn_t gfn)
gfn = unalias_gfn(vcpu->kvm, gfn);
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
 
- 

Re: [PATCH] x86: Pick up local arch trace header

2009-09-23 Thread Jorge Lucángeli Obes
On Sun, Sep 20, 2009 at 3:49 PM, Aidan Marks ai...@cisco.com wrote:

 Ah, right. ok.  hmm, still getting these symbol warnings:

 vger kvm-kmod # git checkout -b queue origin/queue
 M       linux-2.6
 Branch queue set up to track remote branch queue from origin.
 Switched to a new branch 'queue'
 vger kvm-kmod # git submodule update
 Submodule path 'linux-2.6': checked out
 'abb015ac65852287c7a7c243c8cdee966a38854d'
 vger kvm-kmod # ./configure --kerneldir=/usr/src/linux
 vger kvm-kmod # make sync
 ./sync -v  v2.6.31-rc3-4139-gabb015a -l ./linux-2.6
 vger kvm-kmod # make
 make -C /usr/src/linux M=`pwd` \
                LINUXINCLUDE=-I`pwd`/include -Iinclude \
                 -Iarch/x86/include -I`pwd`/include-compat \
                -include include/linux/autoconf.h \
                -include `pwd`/x86/external-module-compat.h  \
                $@
 make[1]: Entering directory `/usr/src/linux-2.6.31-gentoo'
  CC [M]  /tmp/kvm-kmod/x86/svm.o
  CC [M]  /tmp/kvm-kmod/x86/vmx.o
  CC [M]  /tmp/kvm-kmod/x86/vmx-debug.o
  CC [M]  /tmp/kvm-kmod/x86/kvm_main.o
  CC [M]  /tmp/kvm-kmod/x86/x86.o
  CC [M]  /tmp/kvm-kmod/x86/mmu.o
  CC [M]  /tmp/kvm-kmod/x86/emulate.o
  CC [M]  /tmp/kvm-kmod/x86/../anon_inodes.o
  CC [M]  /tmp/kvm-kmod/x86/irq.o
  CC [M]  /tmp/kvm-kmod/x86/i8259.o
  CC [M]  /tmp/kvm-kmod/x86/lapic.o
  CC [M]  /tmp/kvm-kmod/x86/ioapic.o
  CC [M]  /tmp/kvm-kmod/x86/preempt.o
  CC [M]  /tmp/kvm-kmod/x86/i8254.o
  CC [M]  /tmp/kvm-kmod/x86/coalesced_mmio.o
  CC [M]  /tmp/kvm-kmod/x86/irq_comm.o
  CC [M]  /tmp/kvm-kmod/x86/timer.o
  CC [M]  /tmp/kvm-kmod/x86/eventfd.o
  CC [M]  /tmp/kvm-kmod/x86/assigned-dev.o
  CC [M]  /tmp/kvm-kmod/x86/../external-module-compat.o
  CC [M]  /tmp/kvm-kmod/x86/../request-irq-compat.o
  CC [M]  /tmp/kvm-kmod/x86/iommu.o
  CC [M]  /tmp/kvm-kmod/x86/../srcu.o
  LD [M]  /tmp/kvm-kmod/x86/kvm.o
  LD [M]  /tmp/kvm-kmod/x86/kvm-intel.o
  LD [M]  /tmp/kvm-kmod/x86/kvm-amd.o
  LD      /tmp/kvm-kmod/built-in.o
  Building modules, stage 2.
  MODPOST 3 modules
 WARNING: vma_kernel_pagesize [/tmp/kvm-kmod/x86/kvm.ko] undefined!
 WARNING: __tracepoint_kvm_mmu_paging_element
 [/tmp/kvm-kmod/x86/kvm.ko] undefined!
 WARNING: __tracepoint_kvm_mmu_sync_page [/tmp/kvm-kmod/x86/kvm.ko]
 undefined!
 WARNING: __tracepoint_kvm_mmio [/tmp/kvm-kmod/x86/kvm.ko] undefined!

 WARNING: __tracepoint_kvm_mmu_set_accessed_bit
 [/tmp/kvm-kmod/x86/kvm.ko] undefined!
 WARNING: __tracepoint_kvm_exit [/tmp/kvm-kmod/x86/kvm.ko] undefined!

 WARNING: __tracepoint_kvm_pio [/tmp/kvm-kmod/x86/kvm.ko] undefined!

 WARNING: __tracepoint_kvm_mmu_zap_page [/tmp/kvm-kmod/x86/kvm.ko]
 undefined!
 WARNING: __tracepoint_kvm_pic_set_irq [/tmp/kvm-kmod/x86/kvm.ko]
 undefined!
 WARNING: __tracepoint_kvm_cpuid [/tmp/kvm-kmod/x86/kvm.ko] undefined!

 WARNING: __tracepoint_kvm_mmu_unsync_page [/tmp/kvm-kmod/x86/kvm.ko]
 undefined!
 WARNING: __tracepoint_kvm_msr [/tmp/kvm-kmod/x86/kvm.ko] undefined!

 WARNING: __tracepoint_kvm_set_irq [/tmp/kvm-kmod/x86/kvm.ko]
 undefined!
 WARNING: __tracepoint_kvm_apic_accept_irq [/tmp/kvm-kmod/x86/kvm.ko]
 undefined!
 WARNING: __tracepoint_kvm_inj_virq [/tmp/kvm-kmod/x86/kvm.ko]
 undefined!
 WARNING: __tracepoint_kvm_hypercall [/tmp/kvm-kmod/x86/kvm.ko]
 undefined!
 WARNING: __tracepoint_kvm_page_fault [/tmp/kvm-kmod/x86/kvm.ko]
 undefined!
 WARNING: __tracepoint_kvm_entry [/tmp/kvm-kmod/x86/kvm.ko] undefined!

 WARNING: __tracepoint_kvm_apic [/tmp/kvm-kmod/x86/kvm.ko] undefined!

 WARNING: __tracepoint_kvm_ioapic_set_irq [/tmp/kvm-kmod/x86/kvm.ko]
 undefined!
 WARNING: __tracepoint_kvm_apic_ipi [/tmp/kvm-kmod/x86/kvm.ko]
 undefined!
 WARNING: __tracepoint_kvm_cr [/tmp/kvm-kmod/x86/kvm.ko] undefined!

 WARNING: __tracepoint_kvm_ack_irq [/tmp/kvm-kmod/x86/kvm.ko]
 undefined!
 WARNING: __tracepoint_kvm_mmu_walker_error [/tmp/kvm-kmod/x86/kvm.ko]
 undefined!
 WARNING: __tracepoint_kvm_mmu_get_page [/tmp/kvm-kmod/x86/kvm.ko]
 undefined!
 WARNING: __tracepoint_kvm_mmu_set_dirty_bit [/tmp/kvm-kmod/x86/kvm.ko]
 undefined!
 WARNING: __tracepoint_kvm_mmu_pagetable_walk
 [/tmp/kvm-kmod/x86/kvm.ko] undefined!
 WARNING: __tracepoint_kvm_msi_set_irq [/tmp/kvm-kmod/x86/kvm.ko]
 undefined!
  CC      /tmp/kvm-kmod/x86/kvm-amd.mod.o

  LD [M]  /tmp/kvm-kmod/x86/kvm-amd.ko

  CC      /tmp/kvm-kmod/x86/kvm-intel.mod.o

  LD [M]  /tmp/kvm-kmod/x86/kvm-intel.ko

  CC      /tmp/kvm-kmod/x86/kvm.mod.o

  LD [M]  /tmp/kvm-kmod/x86/kvm.ko

 make[1]: Leaving directory `/usr/src/linux-2.6.31-gentoo'

 vger kvm-kmod #

Aidan, were you able to solve this? I was having the same (original)
problem in Xubuntu 64-bits with a custom 2.6.31 kernel and kvm-88. I
still haven't tried Jan's patch (paper deadline at work) but I wanted
to know if you had made any progress.

Cheers,
Jorge
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server

2009-09-23 Thread Avi Kivity

On 09/23/2009 08:58 PM, Gregory Haskins wrote:



It also pulls parts of the device model into the host kernel.
   

That is the point.  Most of it needs to be there for performance.
 

To clarify this point:

There are various aspects about designing high-performance virtual
devices such as providing the shortest paths possible between the
physical resources and the consumers.  Conversely, we also need to
ensure that we meet proper isolation/protection guarantees at the same
time.  What this means is there are various aspects to any
high-performance PV design that require to be placed in-kernel to
maximize the performance yet properly isolate the guest.

For instance, you are required to have your signal-path (interrupts and
hypercalls), your memory-path (gpa translation), and
addressing/isolation model in-kernel to maximize performance.
   


Exactly.  That's what vhost puts into the kernel and nothing more.


Vbus accomplishes its in-kernel isolation model by providing a
container concept, where objects are placed into this container by
userspace.  The host kernel enforces isolation/protection by using a
namespace to identify objects that is only relevant within a specific
container's context (namely, a u32 dev-id).  The guest addresses the
objects by its dev-id, and the kernel ensures that the guest can't
access objects outside of its dev-id namespace.
   


vhost manages to accomplish this without any kernel support.  The guest 
simply has no access to any vhost resources other than the guest-host 
doorbell, which is handed to the guest outside vhost (so it's somebody 
else's problem, in userspace).



All that is required is a way to transport a message with a devid
attribute as an address (such as DEVCALL(devid)) and the framework
provides the rest of the decode+execute function.
   


vhost avoids that.


Contrast this to vhost+virtio-pci (called simply vhost from here).
   


It's the wrong name.  vhost implements only the data path.


It is not immune to requiring in-kernel addressing support either, but
rather it just does it differently (and its not as you might expect via
qemu).

Vhost relies on QEMU to render PCI objects to the guest, which the guest
assigns resources (such as BARs, interrupts, etc).


vhost does not rely on qemu.  It relies on its user to handle 
configuration.  In one important case it's qemu+pci.  It could just as 
well be the lguest launcher.



   A PCI-BAR in this
example may represent a PIO address for triggering some operation in the
device-model's fast-path.  For it to have meaning in the fast-path, KVM
has to have in-kernel knowledge of what a PIO-exit is, and what to do
with it (this is where pio-bus and ioeventfd come in).  The programming
of the PIO-exit and the ioeventfd are likewise controlled by some
userspace management entity (i.e. qemu).   The PIO address and value
tuple form the address, and the ioeventfd framework within KVM provide
the decode+execute function.
   


Right.
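
For concreteness, a minimal sketch of that registration from userspace,
using the KVM_IOEVENTFD ioctl (error handling elided; the port, length
and datamatch values are invented for the example):

#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Tie a PIO doorbell (port, value) to an eventfd: a matching guest
 * write at that port kicks the fd entirely in-kernel, with no exit to
 * userspace.  The fd can then be handed to something like vhost as its
 * kick fd. */
static int register_pio_doorbell(int vm_fd, __u64 port, __u64 value)
{
	struct kvm_ioeventfd kick = {
		.datamatch = value,
		.addr      = port,
		.len       = 2,
		.fd        = eventfd(0, 0),
		.flags     = KVM_IOEVENTFD_FLAG_PIO |
			     KVM_IOEVENTFD_FLAG_DATAMATCH,
	};

	ioctl(vm_fd, KVM_IOEVENTFD, &kick);
	return kick.fd;
}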


This idea seemingly works fine, mind you, but it rides on top of a *lot*
of stuff including but not limited to: the guests pci stack, the qemu
pci emulation, kvm pio support, and ioeventfd.  When you get into
situations where you don't have PCI or even KVM underneath you (e.g. a
userspace container, Ira's rig, etc) trying to recreate all of that PCI
infrastructure for the sake of using PCI is, IMO, a lot of overhead for
little gain.
   


For the N+1th time, no.  vhost is perfectly usable without pci.  Can we 
stop raising and debunking this point?



All you really need is a simple decode+execute mechanism, and a way to
program it from userspace control.  vbus tries to do just that:
commoditize it so all you need is the transport of the control messages
(like DEVCALL()), but the decode+execute itself is reusable, even
across various environments (like KVM or Ira's rig).
   


If you think it should be commoditized, write libvhostconfig.so.


And your argument, I believe, is that vbus allows both to be implemented
in the kernel (though to reiterate, its optional) and is therefore a bad
design, so lets discuss that.

I believe the assertion is that things like config-space are best left
to userspace, and we should only relegate fast-path duties to the
kernel.  The problem is that, in my experience, a good deal of
config-space actually influences the fast-path and thus needs to
interact with the fast-path mechanism eventually anyway.
What's left
over that doesn't fall into this category may cheaply ride on existing
plumbing, so it's not like we created something new or unnatural just to
support this subclass of config-space.
   


Flexibility is reduced, because changing code in the kernel is more 
expensive than in userspace, and kernel/user interfaces aren't typically 
as wide as pure userspace interfaces.  Security is reduced, since a bug 
in the kernel affects the host, while a bug in userspace affects just on 
guest.


Example: feature negotiation.  If it happens in 

Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server

2009-09-23 Thread Gregory Haskins
Avi Kivity wrote:
 On 09/23/2009 08:58 PM, Gregory Haskins wrote:

 It also pulls parts of the device model into the host kernel.

 That is the point.  Most of it needs to be there for performance.
  
 To clarify this point:

 There are various aspects about designing high-performance virtual
 devices such as providing the shortest paths possible between the
 physical resources and the consumers.  Conversely, we also need to
 ensure that we meet proper isolation/protection guarantees at the same
 time.  What this means is there are various aspects to any
 high-performance PV design that require to be placed in-kernel to
 maximize the performance yet properly isolate the guest.

 For instance, you are required to have your signal-path (interrupts and
 hypercalls), your memory-path (gpa translation), and
 addressing/isolation model in-kernel to maximize performance.

 
 Exactly.  That's what vhost puts into the kernel and nothing more.

Actually, no.  Generally, _KVM_ puts those things into the kernel, and
vhost consumes them.  Without KVM (or something equivalent), vhost is
incomplete.  One of my goals with vbus is to generalize the "something
equivalent" part here.

I know you may not care about non-kvm use cases, and that's fine.  No one
says you have to.  However, note that some of us do care about these
non-kvm cases, and thus it's a distinction I am making here as a benefit
of the vbus framework.

 
 Vbus accomplishes its in-kernel isolation model by providing a
 container concept, where objects are placed into this container by
 userspace.  The host kernel enforces isolation/protection by using a
 namespace to identify objects that is only relevant within a specific
 container's context (namely, a u32 dev-id).  The guest addresses the
 objects by its dev-id, and the kernel ensures that the guest can't
 access objects outside of its dev-id namespace.

 
 vhost manages to accomplish this without any kernel support.

No, vhost manages to accomplish this because of KVMs kernel support
(ioeventfd, etc).   Without KVM-like in-kernel support, vhost is
merely a kind of tuntap-like clone signalled by eventfds.

vbus on the other hand, generalizes one more piece of the puzzle
(namely, the function of pio+ioeventfd and userspace's programming of
it) by presenting the devid namespace and container concept.

This goes directly to my rebuttal of your claim that vbus places too
much in the kernel.  I state that, one way or the other, address decode
and isolation _must_ be in the kernel for performance.  Vbus does this
with a devid/container scheme.  vhost+virtio-pci+kvm does it with
pci+pio+ioeventfd.


  The guest
 simply has no access to any vhost resources other than the guest-host
 doorbell, which is handed to the guest outside vhost (so it's somebody
 else's problem, in userspace).

You mean _controlled_ by userspace, right?  Obviously, the other side of
the kernel still needs to be programmed (ioeventfd, etc).  Otherwise,
vhost would be pointless: e.g. just use vanilla tuntap if you don't need
fast in-kernel decoding.

 
 All that is required is a way to transport a message with a devid
 attribute as an address (such as DEVCALL(devid)) and the framework
 provides the rest of the decode+execute function.

 
 vhost avoids that.

No, it doesn't avoid it.  It just doesn't specify how it's done, and
relies on something else to do it on its behalf.

Conversely, vbus specifies how it's done, but not how to transport the
verb across the wire.  That is the role of the vbus-connector abstraction.

 
 Contrast this to vhost+virtio-pci (called simply vhost from here).

 
 It's the wrong name.  vhost implements only the data path.

Understood, but vhost+virtio-pci is what I am contrasting, and I use
vhost for short from that point on because I am too lazy to type the
whole name over and over ;)

 
 It is not immune to requiring in-kernel addressing support either, but
 rather it just does it differently (and its not as you might expect via
 qemu).

 Vhost relies on QEMU to render PCI objects to the guest, which the guest
 assigns resources (such as BARs, interrupts, etc).
 
 vhost does not rely on qemu.  It relies on its user to handle
 configuration.  In one important case it's qemu+pci.  It could just as
 well be the lguest launcher.

I meant vhost=vhost+virtio-pci here.  Sorry for the confusion.

The point I am making specifically is that vhost in general relies on
other in-kernel components to function.  I.e. It cannot function without
having something like the PCI model to build an IO namespace.  That
namespace (in this case, pio addresses+data tuples) are used for the
in-kernel addressing function under KVM + virtio-pci.

The case of the lguest launcher is a good one to highlight.  Yes, you
can presumably also use lguest with vhost, if the requisite facilities
are exposed to lguest-bus, and some eventfd based thing like ioeventfd
is written for the host (if it doesn't exist already).

And 

[PATCH] kvm: allocate correct size for dirty bitmap

2009-09-23 Thread Bruce Rogers
The dirty bitmap is stored in a long array in the kernel, and gets 
copied out to userspace accordingly.  This patch accounts for that correctly.  
Currently I'm seeing kvm crash due to writing beyond the end of the alloc'd 
dirty bitmap memory, because the buffer has the wrong size.
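
For reference, the size the buffer actually needs -- pages rounded up to
whole longs rather than bytes -- can be sketched as below.  (BITMAP_SIZE
is qemu-kvm's helper for this computation; the sketch only illustrates
what it has to produce.)

#include <stdint.h>
#include <stddef.h>

/* Bytes needed for a dirty bitmap covering `len` bytes of guest memory
 * at 4096-byte page granularity, when the kernel fills it in as an
 * array of longs. */
static size_t dirty_bitmap_bytes(uint64_t len)
{
    uint64_t pages = len / 4096;
    uint64_t bits_per_long = 8 * sizeof(long);

    return ((pages + bits_per_long - 1) / bits_per_long) * sizeof(long);
}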

Signed-off-by: Bruce Rogers 
---
 qemu-kvm.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qemu-kvm.c b/qemu-kvm.c
index 6511cb6..ee5db76 100644
--- a/qemu-kvm.c
+++ b/qemu-kvm.c
@@ -702,7 +702,7 @@ int kvm_get_dirty_pages_range(kvm_context_t kvm, unsigned 
long phys_addr,
for (i = 0; i < KVM_MAX_NUM_MEM_REGIONS; ++i) {
if ((slots[i].len && (uint64_t) slots[i].phys_addr >= phys_addr)
 && ((uint64_t) slots[i].phys_addr + slots[i].len <= end_addr)) {
-buf = qemu_malloc((slots[i].len / 4096 + 7) / 8 + 2);
+buf = qemu_malloc(BITMAP_SIZE(slots[i].len));
 r = kvm_get_map(kvm, KVM_GET_DIRTY_LOG, i, buf);
 if (r) {
 qemu_free(buf);


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH: kvm 1/6] Code motion. Separate timer initialization into an independent function.

2009-09-23 Thread Zachary Amsden
Signed-off-by: Zachary Amsden zams...@redhat.com
---
 arch/x86/kvm/x86.c |   23 +++
 1 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fedac9d..15d2ace 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3116,9 +3116,22 @@ static struct notifier_block 
kvmclock_cpufreq_notifier_block = {
 .notifier_call  = kvmclock_cpufreq_notifier
 };
 
+static void kvm_timer_init(void)
+{
+   int cpu;
+
+   for_each_possible_cpu(cpu)
+   per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+   if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+   tsc_khz_ref = tsc_khz;
+   cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+   }
+}
+
 int kvm_arch_init(void *opaque)
 {
-   int r, cpu;
+   int r;
struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
 
if (kvm_x86_ops) {
@@ -3150,13 +3163,7 @@ int kvm_arch_init(void *opaque)
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0);
 
-   for_each_possible_cpu(cpu)
-   per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
-   if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
-   tsc_khz_ref = tsc_khz;
-   cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
- CPUFREQ_TRANSITION_NOTIFIER);
-   }
+   kvm_timer_init();
 
return 0;
 
-- 
1.6.4.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH: kvm 3/6] Fix hotadd of CPUs for KVM.

2009-09-23 Thread Zachary Amsden
Both VMX and SVM require per-cpu memory allocation, which is done at module
init time, for only online cpus.  When bringing a new CPU online, we must
also allocate this structure.  The method chosen to implement this is to
make the CPU online notifier available via a call to the arch code.  This
allows memory allocation to be done smoothly, without any need to allocate
extra structures.

Note: CPU up notifiers may call KVM callback before calling cpufreq callbacks.
This would cause the CPU frequency not to be detected (and it is not always
clear on non-constant TSC platforms what the bringup TSC rate will be, so the
guess of using tsc_khz could be wrong).  So, we clear the rate to zero in such
a case and add logic to query it upon entry.
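
Roughly, the query-upon-entry looks like this (a sketch only, since the
corresponding hunk is truncated below; kvm_get_cpu_khz is introduced in
patch 2/6 of this series):

void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	kvm_x86_ops->vcpu_load(vcpu, cpu);
	/* hotplug cleared the rate; recompute it on first entry */
	if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0))
		kvm_get_cpu_khz(cpu);
}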

Signed-off-by: Zachary Amsden zams...@redhat.com
---
 arch/x86/include/asm/kvm_host.h |2 ++
 arch/x86/kvm/svm.c  |   15 +--
 arch/x86/kvm/vmx.c  |   17 +
 arch/x86/kvm/x86.c  |   14 +-
 include/linux/kvm_host.h|6 ++
 virt/kvm/kvm_main.c |3 ++-
 6 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 299cc1b..b7dd14b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -459,6 +459,7 @@ struct descriptor_table {
 struct kvm_x86_ops {
int (*cpu_has_kvm_support)(void);  /* __init */
int (*disabled_by_bios)(void); /* __init */
+   int (*cpu_hotadd)(int cpu);
int (*hardware_enable)(void *dummy);
void (*hardware_disable)(void *dummy);
void (*check_processor_compatibility)(void *rtn);
@@ -791,6 +792,7 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
_ASM_PTR " 666b, 667b \n\t" \
".popsection"
 
+#define KVM_ARCH_WANT_HOTPLUG_NOTIFIER
 #define KVM_ARCH_WANT_MMU_NOTIFIER
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9a4daca..8f99d0c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -330,13 +330,13 @@ static int svm_hardware_enable(void *garbage)
return -EBUSY;
 
if (!has_svm()) {
-   printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me);
+   printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n", 
me);
return -EINVAL;
}
svm_data = per_cpu(svm_data, me);
 
if (!svm_data) {
-   printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n",
+   printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n",
   me);
return -EINVAL;
}
@@ -394,6 +394,16 @@ err_1:
 
 }
 
+static __cpuinit int svm_cpu_hotadd(int cpu)
+{
+   struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
+
+   if (svm_data)
+   return 0;
+
+   return svm_cpu_init(cpu);
+}
+
 static void set_msr_interception(u32 *msrpm, unsigned msr,
 int read, int write)
 {
@@ -2858,6 +2868,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.hardware_setup = svm_hardware_setup,
.hardware_unsetup = svm_hardware_unsetup,
.check_processor_compatibility = svm_check_processor_compat,
+   .cpu_hotadd = svm_cpu_hotadd,
.hardware_enable = svm_hardware_enable,
.hardware_disable = svm_hardware_disable,
.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3fe0d42..b8a8428 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1408,6 +1408,22 @@ static __exit void hardware_unsetup(void)
free_kvm_area();
 }
 
+static __cpuinit int vmx_cpu_hotadd(int cpu)
+{
+   struct vmcs *vmcs;
+
+   if (per_cpu(vmxarea, cpu))
+   return 0;
+
+   vmcs = alloc_vmcs_cpu(cpu);
+   if (!vmcs) 
+   return -ENOMEM;
+
+   per_cpu(vmxarea, cpu) = vmcs;
+
+   return 0;
+}
+
 static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
 {
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -3925,6 +3941,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.hardware_setup = hardware_setup,
.hardware_unsetup = hardware_unsetup,
.check_processor_compatibility = vmx_check_processor_compat,
+   .cpu_hotadd = vmx_cpu_hotadd,
.hardware_enable = hardware_enable,
.hardware_disable = hardware_disable,
.cpu_has_accelerated_tpr = report_flexpriority,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 35082dd..05aea42 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1339,6 +1339,8 @@ out:
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
kvm_x86_ops->vcpu_load(vcpu, cpu);
+   if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0))
+   

[PATCH: kvm 4/6] Fix hotremove of CPUs for KVM.

2009-09-23 Thread Zachary Amsden
In the process of bringing down CPUs, the SVM / VMX structures associated
with those CPUs are not freed.  This may cause leaks when unloading and
reloading the KVM module, as only the structures associated with online
CPUs are cleaned up.

Signed-off-by: Zachary Amsden zams...@redhat.com
---
 arch/x86/include/asm/kvm_host.h |1 +
 arch/x86/kvm/svm.c  |1 +
 arch/x86/kvm/vmx.c  |7 +++
 arch/x86/kvm/x86.c  |5 +
 include/linux/kvm_host.h|2 ++
 virt/kvm/kvm_main.c |2 ++
 6 files changed, 18 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b7dd14b..91e02d3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -460,6 +460,7 @@ struct kvm_x86_ops {
int (*cpu_has_kvm_support)(void);  /* __init */
int (*disabled_by_bios)(void); /* __init */
int (*cpu_hotadd)(int cpu);
+   void (*cpu_hotremove)(int cpu);
int (*hardware_enable)(void *dummy);
void (*hardware_disable)(void *dummy);
void (*check_processor_compatibility)(void *rtn);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8f99d0c..7cf6c98 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2869,6 +2869,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.hardware_unsetup = svm_hardware_unsetup,
.check_processor_compatibility = svm_check_processor_compat,
.cpu_hotadd = svm_cpu_hotadd,
+   .cpu_hotremove = svm_cpu_uninit,
.hardware_enable = svm_hardware_enable,
.hardware_disable = svm_hardware_disable,
.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b8a8428..1e3e9fc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1424,6 +1424,12 @@ static __cpuinit int vmx_cpu_hotadd(int cpu)
return 0;
 }
 
+static __cpuinit void vmx_cpu_hotremove(int cpu)
+{
+   free_vmcs(per_cpu(vmxarea, cpu));
+   per_cpu(vmxarea, cpu) = NULL;
+}
+
 static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
 {
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -3942,6 +3948,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.hardware_unsetup = hardware_unsetup,
.check_processor_compatibility = vmx_check_processor_compat,
.cpu_hotadd = vmx_cpu_hotadd,
+   .cpu_hotremove = vmx_cpu_hotremove,
.hardware_enable = hardware_enable,
.hardware_disable = hardware_disable,
.cpu_has_accelerated_tpr = report_flexpriority,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 05aea42..38ba4a6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4719,6 +4719,11 @@ int kvm_arch_cpu_hotadd(int cpu)
return kvm_x86_ops->cpu_hotadd(cpu);
 }
 
+void kvm_arch_cpu_hotremove(int cpu)
+{
+   kvm_x86_ops->cpu_hotremove(cpu);
+}
+
 int kvm_arch_hardware_enable(void *garbage)
 {
/*
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2f075c4..2c844f0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -347,8 +347,10 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
 
 #ifdef KVM_ARCH_WANT_HOTPLUG_NOTIFIER
 int kvm_arch_cpu_hotadd(int cpu);
+void kvm_arch_cpu_hotremove(int cpu);
 #else
 #define kvm_arch_cpu_hotadd(x) (0)
+#define kvm_arch_cpu_hotremove(x)
 #endif
 
 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1360db4..bd21927 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1725,11 +1725,13 @@ static int kvm_cpu_hotplug(struct notifier_block 
*notifier, unsigned long val,
printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
   cpu);
hardware_disable(NULL);
+   kvm_arch_cpu_hotremove(cpu);
break;
case CPU_UP_CANCELED:
printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
   cpu);
smp_call_function_single(cpu, hardware_disable, NULL, 1);
+   kvm_arch_cpu_hotremove(cpu);
break;
case CPU_ONLINE:
printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
-- 
1.6.4.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH: kvm 5/6] Don't unconditionally clear cpu_tsc_khz in hardware_enable.

2009-09-23 Thread Zachary Amsden
For the non-hotplug case, this TSC speed should be available; instead, clear
cpu_tsc_khz when bringing down a CPU so it is recomputed at the next bringup.

Signed-off-by: Zachary Amsden zams...@redhat.com
---
 arch/x86/kvm/x86.c |   12 +---
 1 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 38ba4a6..f1470ce 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4721,18 +4721,16 @@ int kvm_arch_cpu_hotadd(int cpu)
 
 void kvm_arch_cpu_hotremove(int cpu)
 {
+   /*
+* Reset TSC khz to zero so it is recomputed on bringup
+*/
+   if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+   per_cpu(cpu_tsc_khz, cpu) = 0;
kvm_x86_ops->cpu_hotremove(cpu);
 }
 
 int kvm_arch_hardware_enable(void *garbage)
 {
-   /*
-* Notifier callback chain may not have called cpufreq code
-* yet, thus we must reset TSC khz to zero and recompute it
-* before entering.
-*/
-   if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
-   per_cpu(cpu_tsc_khz, raw_smp_processor_id()) = 0;
return kvm_x86_ops->hardware_enable(garbage);
 }
 
-- 
1.6.4.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH: kvm 6/6] Math is hard; let's do some cooking.

2009-09-23 Thread Zachary Amsden
CPU frequency change callback provides new TSC frequency for us, and in the
same units (kHz), so there is no reason to do any math.

Signed-off-by: Zachary Amsden zams...@redhat.com
---
 arch/x86/kvm/x86.c |5 +
 1 files changed, 1 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f1470ce..c849f8f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3083,15 +3083,12 @@ static int kvmclock_cpufreq_notifier(struct 
notifier_block *nb, unsigned long va
struct kvm *kvm;
struct kvm_vcpu *vcpu;
int i, send_ipi = 0;
-   unsigned long old_khz;
 
if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
return 0;
if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
return 0;
-   old_khz = per_cpu(cpu_tsc_khz, freq->cpu);
-   per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(old_khz, freq->old,
-   freq->new);
+   per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
 
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
-- 
1.6.4.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH: kvm 2/6] Kill the confusing tsc_ref_khz and ref_freq variables.

2009-09-23 Thread Zachary Amsden
They are globals, not clearly protected by any ordering or locking, and
vulnerable to various startup races.

Instead, for variable TSC machines, register the cpufreq notifier and get
the TSC frequency directly from the cpufreq machinery.  Not only is it
always right, it is also perfectly accurate, as no error-prone measurement
is required.  On such machines, also detect the frequency when bringing
a new CPU online; it isn't clear what frequency it will start with, and
it may not correspond to the reference.

Signed-off-by: Zachary Amsden zams...@redhat.com
---
 arch/x86/kvm/x86.c |   38 --
 1 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 15d2ace..35082dd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -650,6 +650,19 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct 
pvclock_vcpu_time_info *
 
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 
+static inline void kvm_get_cpu_khz(int cpu)
+{
+   unsigned int khz = cpufreq_get(cpu);
+   if (khz <= 0) {
+   static int warn = 0;
+   if (warn++ < 10)
+   printk(KERN_ERR "kvm: could not get frequency for CPU "
+   "%d.  Timers may be inaccurate\n", cpu);
+   khz = tsc_khz;
+   }
+   per_cpu(cpu_tsc_khz, cpu) = khz;
+}
+
 static void kvm_write_guest_time(struct kvm_vcpu *v)
 {
struct timespec ts;
@@ -3061,9 +3074,6 @@ static void bounce_off(void *info)
/* nothing */
 }
 
-static unsigned int  ref_freq;
-static unsigned long tsc_khz_ref;
-
 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long 
val,
 void *data)
 {
@@ -3071,15 +3081,15 @@ static int kvmclock_cpufreq_notifier(struct 
notifier_block *nb, unsigned long va
struct kvm *kvm;
struct kvm_vcpu *vcpu;
int i, send_ipi = 0;
-
-   if (!ref_freq)
-   ref_freq = freq->old;
+   unsigned long old_khz;
 
if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
return 0;
if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
return 0;
-   per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, 
freq->new);
+   old_khz = per_cpu(cpu_tsc_khz, freq->cpu);
+   per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(old_khz, freq->old,
+   freq->new);
 
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
@@ -3120,12 +3130,18 @@ static void kvm_timer_init(void)
 {
int cpu;
 
-   for_each_possible_cpu(cpu)
-   per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
-   tsc_khz_ref = tsc_khz;
cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
  CPUFREQ_TRANSITION_NOTIFIER);
+   for_each_online_cpu(cpu)
+   kvm_get_cpu_khz(cpu);
+   } else {
+   for_each_possible_cpu(cpu)
+   per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+   }
+   for_each_possible_cpu(cpu) {
+   printk(KERN_DEBUG "kvm: cpu %d = %ld khz\n",
+   cpu, per_cpu(cpu_tsc_khz, cpu));
}
 }
 
@@ -4698,6 +4714,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 
 int kvm_arch_hardware_enable(void *garbage)
 {
+   if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+   kvm_get_cpu_khz(raw_smp_processor_id());
return kvm_x86_ops->hardware_enable(garbage);
 }
 
-- 
1.6.4.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


linux-next: tree build failure

2009-09-23 Thread Stephen Rothwell
Hi all,

Today's linux-next build (powerpc ppc44x_defconfig) failed like this:

In file included from arch/powerpc/kvm/booke.c:31:
arch/powerpc/kvm/timing.h: In function 'kvmppc_account_exit_stat':
arch/powerpc/kvm/timing.h:51: error: bit-field 'anonymous' width not an 
integer constant
In file included from arch/powerpc/kvm/booke.h:26,
 from arch/powerpc/kvm/booke_emulate.c:23:
arch/powerpc/kvm/timing.h: In function 'kvmppc_account_exit_stat':
arch/powerpc/kvm/timing.h:51: error: bit-field 'anonymous' width not an 
integer constant

Presumably caused by commit 8c87df457cb58fe75b9b893007917cf8095660a0
("BUILD_BUG_ON(): fix it and a couple of bogus uses of it").
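
That commit makes BUILD_BUG_ON expand to a negative-width bit-field,
roughly (paraphrased from include/linux/kernel.h; the exact spelling
may differ):

#define BUILD_BUG_ON(condition) ((void)sizeof(struct { int:-!!(condition); }))

A bit-field width must be an integer constant expression, and gcc cannot
fold __builtin_constant_p(type) early enough when type is a runtime
function argument, which is exactly the bit-field 'anonymous' width
error above.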

I applied the following patch for today.  This inline function is
only called from one place in one file ...

From: Stephen Rothwell s...@canb.auug.org.au
Date: Thu, 24 Sep 2009 15:13:20 +1000
Subject: [PATCH] powerpc/kvm: build fix for new BUILD_BUG_ON

Signed-off-by: Stephen Rothwell s...@canb.auug.org.au
---
 arch/powerpc/kvm/timing.h |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/kvm/timing.h b/arch/powerpc/kvm/timing.h
index bb13b1f..4c34099 100644
--- a/arch/powerpc/kvm/timing.h
+++ b/arch/powerpc/kvm/timing.h
@@ -48,7 +48,7 @@ static inline void kvmppc_set_exit_type(struct kvm_vcpu 
*vcpu, int type) {}
 static inline void kvmppc_account_exit_stat(struct kvm_vcpu *vcpu, int type)
 {
/* type has to be known at build time for optimization */
-   BUILD_BUG_ON(__builtin_constant_p(type));
+   //BUILD_BUG_ON(__builtin_constant_p(type));
switch (type) {
case EXT_INTR_EXITS:
vcpu->stat.ext_intr_exits++;
-- 
1.6.4.3


-- 
Cheers,
Stephen Rothwells...@canb.auug.org.au
--
To unsubscribe from this list: send the line unsubscribe kvm-ppc in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html