[GIT PULL] Second batch of KVM changes for 4.1

2015-04-23 Thread Paolo Bonzini
Linus,

The following changes since commit b79013b2449c23f1f505bdf39c5a6c330338b244:

  Merge tag 'staging-4.1-rc1' of 
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging (2015-04-13 
17:37:33 -0700)

are available in the git repository at:


  git://git.kernel.org/pub/scm/virt/kvm/kvm.git tags/for-linus

for you to fetch changes up to 2fa462f826210bbec65f8ed06d5ef4e0cd4f5450:

  Merge tag 'kvm-arm-for-4.1-take2' of 
git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into kvm-master 
(2015-04-22 17:08:12 +0200)



This mostly includes the PPC changes for 4.1, which this time cover
Book3S HV only (debugging aids, minor performance improvements and some
cleanups).  But there are also bug fixes and small cleanups for ARM,
x86 and s390.

The task_migration_notifier revert and real fix are still pending review,
but I'll send them as soon as possible after -rc1.


Andre Przywara (1):
  KVM: arm/arm64: check IRQ number on userland injection

Aneesh Kumar K.V (2):
  KVM: PPC: Book3S HV: Remove RMA-related variables from code
  KVM: PPC: Book3S HV: Add helpers for lock/unlock hpte

Ben Serebrin (1):
  KVM: VMX: Preserve host CR4.MCE value while in guest mode.

Christian Borntraeger (1):
  KVM: s390: disable RRBM again

David Gibson (1):
  kvmppc: Implement H_LOGICAL_CI_{LOAD,STORE} in KVM

Eric Auger (1):
  KVM: arm: irqfd: fix value returned by kvm_irq_map_gsi

Michael Ellerman (1):
  KVM: PPC: Book3S HV: Add fast real-mode H_RANDOM implementation.

Nadav Amit (1):
  KVM: x86: Fix MSR_IA32_BNDCFGS in msrs_to_save

Paolo Bonzini (3):
  KVM: x86: cleanup kvm_irq_delivery_to_apic_fast
  Merge tag 'signed-kvm-ppc-queue' of git://github.com/agraf/linux-2.6 into 
kvm-master
  Merge tag 'kvm-arm-for-4.1-take2' of 
git://git.kernel.org/.../kvmarm/kvmarm into kvm-master

Paul Mackerras (12):
  KVM: PPC: Book3S HV: Create debugfs file for each guest's HPT
  KVM: PPC: Book3S HV: Accumulate timing information for real-mode code
  KVM: PPC: Book3S HV: Simplify handling of VCPUs that need a VPA update
  KVM: PPC: Book3S HV: Minor cleanups
  KVM: PPC: Book3S HV: Move vcore preemption point up into kvmppc_run_vcpu
  KVM: PPC: Book3S HV: Get rid of vcore nap_count and n_woken
  KVM: PPC: Book3S HV: Don't wake thread with no vcpu on guest IPI
  KVM: PPC: Book3S HV: Use decrementer to wake napping threads
  KVM: PPC: Book3S HV: Use bitmap of active threads rather than count
  KVM: PPC: Book3S HV: Streamline guest entry and exit
  KVM: PPC: Book3S HV: Translate kvmhv_commence_exit to C
  KVM: PPC: Book3S HV: Use msgsnd for signalling threads on POWER8

Suresh E. Warrier (2):
  powerpc: Export __spin_yield
  KVM: PPC: Book3S HV: Add guest-host real mode completion counters

Suresh Warrier (3):
  KVM: PPC: Book3S HV: Convert ICS mutex lock to spin lock
  KVM: PPC: Book3S HV: Move virtual mode ICP functions to real-mode
  KVM: PPC: Book3S HV: Add ICP real mode counters

Wanpeng Li (1):
  kvm: mmu: don't do memslot overflow check

Xiao Guangrong (1):
  KVM: MMU: fix comment in kvm_mmu_zap_collapsible_spte

 Documentation/virtual/kvm/api.txt|  17 +
 arch/arm/include/uapi/asm/kvm.h  |   8 +-
 arch/arm/kvm/arm.c   |   3 +-
 arch/arm64/include/uapi/asm/kvm.h|   8 +-
 arch/powerpc/include/asm/archrandom.h|  11 +-
 arch/powerpc/include/asm/kvm_book3s.h|   3 +
 arch/powerpc/include/asm/kvm_book3s_64.h |  18 +
 arch/powerpc/include/asm/kvm_host.h  |  47 ++-
 arch/powerpc/include/asm/kvm_ppc.h   |   2 +
 arch/powerpc/include/asm/time.h  |   3 +
 arch/powerpc/kernel/asm-offsets.c|  20 +-
 arch/powerpc/kernel/time.c   |   6 +
 arch/powerpc/kvm/Kconfig |  14 +
 arch/powerpc/kvm/book3s.c|  76 +
 arch/powerpc/kvm/book3s_64_mmu_hv.c  | 189 +--
 arch/powerpc/kvm/book3s_hv.c | 435 ++--
 arch/powerpc/kvm/book3s_hv_builtin.c | 100 +-
 arch/powerpc/kvm/book3s_hv_rm_mmu.c  |  25 +-
 arch/powerpc/kvm/book3s_hv_rm_xics.c | 238 +++--
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  | 559 +++
 arch/powerpc/kvm/book3s_pr_papr.c|  28 ++
 arch/powerpc/kvm/book3s_xics.c   | 105 --
 arch/powerpc/kvm/book3s_xics.h   |  13 +-
 arch/powerpc/kvm/powerpc.c   |   3 +
 arch/powerpc/lib/locks.c |   1 +
 arch/powerpc/platforms/powernv/rng.c |  29 ++
 arch/s390/kvm/kvm-s390.c |   2 +-
 arch/x86/kvm/lapic.c |  11 +-
 arch/x86/kvm/mmu.c   |  20 +-
 arch/x86/kvm/vmx.c   |  12 +-
 arch/x86/kvm/x86.c   |  10 +-
 include/uapi/linux/kvm.h |   

Re: [PATCH v2 06/10] KVM: arm64: guest debug, add SW break point support

2015-04-23 Thread Alex Bennée

Christoffer Dall christoffer.d...@linaro.org writes:

 On Tue, Mar 31, 2015 at 04:08:04PM +0100, Alex Bennée wrote:
 This adds support for SW breakpoints inserted by userspace.
 
 We do this by trapping all BKPT exceptions in the
 hypervisor (MDCR_EL2_TDE).

 you mean trapping all exceptions in the guest to the hypervisor?

 The kvm_debug_exit_arch carries the address
 of the exception.

 why?  can userspace not simply read out the PC using GET_ONE_REG?

Yes, I have re-worded and removed PC from the debug information.

snip
  
 +	/* Trap breakpoints? */
 +	if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
 +		vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE;
 +	else
 +		vcpu->arch.mdcr_el2 &= ~MDCR_EL2_TDE;

 so now you're trapping all debug exceptions, right?

 what happens if the guest is using the hardware to debug debug stuff and
 generates other kinds of debug exceptions, like a hardware breakpoint,
 will we not see an unhandled exception and the guest being forcefully
 killed?

Yes, until the later patches, which stop the guest from using HW debug
registers while we are using them.


 +
  }
  
  void kvm_arch_clear_debug(struct kvm_vcpu *vcpu)
 diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
 index 524fa25..ed1bbb4 100644
 --- a/arch/arm64/kvm/handle_exit.c
 +++ b/arch/arm64/kvm/handle_exit.c
 @@ -82,6 +82,37 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct 
 kvm_run *run)
  return 1;
  }
  
 +/**
 + * kvm_handle_debug_exception - handle a debug exception instruction

 handle a software breakpoint exception

 + *
 + * @vcpu:   the vcpu pointer
 + * @run:access to the kvm_run structure for results
 + *
 + * We route all debug exceptions through the same handler as we

 all debug exceptions?  software breakpoints and all?  then why the above
 short text?

 + * just need to report the PC and the HSR values to userspace.
 + * Userspace may decide to re-inject the exception and deliver it to
 + * the guest if it wasn't for the host to deal with.

 now I'm confused - does userspace set up the guest to receive an
 exception or does it tell KVM to emulate an exception for the guest or
 do we execute the breakpoint without trapping the debug exception?

I've made it all go through userspace as we may have to translate the
hypervisor visible exception code to what the guest was expecting to see.


 + */
 +static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu, struct kvm_run 
 *run)
 +{
 +	u32 hsr = kvm_vcpu_get_hsr(vcpu);
 +
 +	run->exit_reason = KVM_EXIT_DEBUG;
 +	run->debug.arch.hsr = hsr;
 +
 +	switch (hsr >> ESR_ELx_EC_SHIFT) {
 +	case ESR_ELx_EC_BKPT32:
 +	case ESR_ELx_EC_BRK64:
 +		run->debug.arch.pc = *vcpu_pc(vcpu);
 +		break;
 +	default:
 +		kvm_err("%s: un-handled case hsr: %#08x\n",
 +			__func__, (unsigned int) hsr);

 this should never happen right?

At the moment it could; by the end of the patch series we should cover
all the cases, so it would indicate a bug. I've made it return an error
code so it fails hard, as suggested by David.
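
Roughly like this (a sketch of the idea only; the exact error value may
still differ in the final series):

	default:
		kvm_err("%s: un-handled case hsr: %#08x\n",
			__func__, (unsigned int) hsr);
		return -1;
	}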

-- 
Alex Bennée


Re: [PATCH] kvm: x86: fix kvmclock update protocol

2015-04-23 Thread Radim Krčmář
2015-04-23 13:46+0200, Paolo Bonzini:
 From: Radim Krčmář rkrc...@redhat.com
 
 The kvmclock spec says that the host will increment a version field to
 an odd number, then update stuff, then increment it to an even number.
 The host is buggy and doesn't do this, and the result is observable
 when one vcpu reads another vcpu's kvmclock data.
 
 There's no good way for a guest kernel to keep its vdso from reading
 a different vcpu's kvmclock data, but we don't need to care about
 changing VCPUs as long as we read consistent data from kvmclock.
 (VCPU can change outside of this loop too, so it doesn't matter if we
 return a value not fit for this VCPU.)
 
 Based on a patch by Radim Krčmář.
 
 Signed-off-by: Paolo Bonzini pbonz...@redhat.com
 ---

Nice,

Reviewed-by: Radim Krčmář rkrc...@redhat.com
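
For readers following the thread: the reader side of the version
protocol described above is roughly the following (a simplified sketch,
not the actual pvclock code; the field and function names are
illustrative and memory barriers are omitted):

	struct time_info {
		unsigned int version;
		/* tsc_timestamp, system_time, mult, shift, ... */
	};

	static struct time_info read_time_info(const struct time_info *src)
	{
		struct time_info snapshot;
		unsigned int v1, v2;

		do {
			do {
				v1 = src->version;
			} while (v1 & 1);	/* odd: host is mid-update, wait */
			snapshot = *src;	/* copy the whole record */
			v2 = src->version;
		} while (v1 != v2);		/* version changed under us, retry */

		return snapshot;
	}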


[PATCH v5 2/8] tun: add tun_is_little_endian() helper

2015-04-23 Thread Greg Kurz
Signed-off-by: Greg Kurz gk...@linux.vnet.ibm.com
---
 drivers/net/tun.c |9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 857dca4..3c3d6c0 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -206,14 +206,19 @@ struct tun_struct {
u32 flow_count;
 };
 
+static inline bool tun_is_little_endian(struct tun_struct *tun)
+{
+	return tun->flags & TUN_VNET_LE;
+}
+
 static inline u16 tun16_to_cpu(struct tun_struct *tun, __virtio16 val)
 {
-	return __virtio16_to_cpu(tun->flags & TUN_VNET_LE, val);
+   return __virtio16_to_cpu(tun_is_little_endian(tun), val);
 }
 
 static inline __virtio16 cpu_to_tun16(struct tun_struct *tun, u16 val)
 {
-	return __cpu_to_virtio16(tun->flags & TUN_VNET_LE, val);
+   return __cpu_to_virtio16(tun_is_little_endian(tun), val);
 }
 
 static inline u32 tun_hashfn(u32 rxhash)



[PATCH v5 3/8] macvtap: introduce macvtap_is_little_endian() helper

2015-04-23 Thread Greg Kurz
Signed-off-by: Greg Kurz gk...@linux.vnet.ibm.com
---
 drivers/net/macvtap.c |9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 27ecc5c..a2f2958 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -49,14 +49,19 @@ struct macvtap_queue {
 
 #define MACVTAP_VNET_LE 0x8000
 
+static inline bool macvtap_is_little_endian(struct macvtap_queue *q)
+{
+	return q->flags & MACVTAP_VNET_LE;
+}
+
 static inline u16 macvtap16_to_cpu(struct macvtap_queue *q, __virtio16 val)
 {
-	return __virtio16_to_cpu(q->flags & MACVTAP_VNET_LE, val);
+   return __virtio16_to_cpu(macvtap_is_little_endian(q), val);
 }
 
 static inline __virtio16 cpu_to_macvtap16(struct macvtap_queue *q, u16 val)
 {
-	return __cpu_to_virtio16(q->flags & MACVTAP_VNET_LE, val);
+   return __cpu_to_virtio16(macvtap_is_little_endian(q), val);
 }
 
 static struct proto macvtap_proto = {



[PATCH v5 4/8] vringh: introduce vringh_is_little_endian() helper

2015-04-23 Thread Greg Kurz
Signed-off-by: Greg Kurz gk...@linux.vnet.ibm.com
---
 include/linux/vringh.h |   17 +++--
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/include/linux/vringh.h b/include/linux/vringh.h
index a3fa537..3ed62ef 100644
--- a/include/linux/vringh.h
+++ b/include/linux/vringh.h
@@ -226,33 +226,38 @@ static inline void vringh_notify(struct vringh *vrh)
	vrh->notify(vrh);
 }
 
+static inline bool vringh_is_little_endian(const struct vringh *vrh)
+{
+	return vrh->little_endian;
+}
+
 static inline u16 vringh16_to_cpu(const struct vringh *vrh, __virtio16 val)
 {
-	return __virtio16_to_cpu(vrh->little_endian, val);
+   return __virtio16_to_cpu(vringh_is_little_endian(vrh), val);
 }
 
 static inline __virtio16 cpu_to_vringh16(const struct vringh *vrh, u16 val)
 {
-	return __cpu_to_virtio16(vrh->little_endian, val);
+   return __cpu_to_virtio16(vringh_is_little_endian(vrh), val);
 }
 
 static inline u32 vringh32_to_cpu(const struct vringh *vrh, __virtio32 val)
 {
-	return __virtio32_to_cpu(vrh->little_endian, val);
+   return __virtio32_to_cpu(vringh_is_little_endian(vrh), val);
 }
 
 static inline __virtio32 cpu_to_vringh32(const struct vringh *vrh, u32 val)
 {
-	return __cpu_to_virtio32(vrh->little_endian, val);
+   return __cpu_to_virtio32(vringh_is_little_endian(vrh), val);
 }
 
 static inline u64 vringh64_to_cpu(const struct vringh *vrh, __virtio64 val)
 {
-	return __virtio64_to_cpu(vrh->little_endian, val);
+   return __virtio64_to_cpu(vringh_is_little_endian(vrh), val);
 }
 
 static inline __virtio64 cpu_to_vringh64(const struct vringh *vrh, u64 val)
 {
-	return __cpu_to_virtio64(vrh->little_endian, val);
+   return __cpu_to_virtio64(vringh_is_little_endian(vrh), val);
 }
 #endif /* _LINUX_VRINGH_H */



[PATCH v5 5/8] vhost: introduce vhost_is_little_endian() helper

2015-04-23 Thread Greg Kurz
Signed-off-by: Greg Kurz gk...@linux.vnet.ibm.com
---
 drivers/vhost/vhost.h |   17 +++--
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 8c1c792..6a49960 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -173,34 +173,39 @@ static inline bool vhost_has_feature(struct 
vhost_virtqueue *vq, int bit)
	return vq->acked_features & (1ULL << bit);
 }
 
+static inline bool vhost_is_little_endian(struct vhost_virtqueue *vq)
+{
+   return vhost_has_feature(vq, VIRTIO_F_VERSION_1);
+}
+
 /* Memory accessors */
 static inline u16 vhost16_to_cpu(struct vhost_virtqueue *vq, __virtio16 val)
 {
-   return __virtio16_to_cpu(vhost_has_feature(vq, VIRTIO_F_VERSION_1), 
val);
+   return __virtio16_to_cpu(vhost_is_little_endian(vq), val);
 }
 
 static inline __virtio16 cpu_to_vhost16(struct vhost_virtqueue *vq, u16 val)
 {
-   return __cpu_to_virtio16(vhost_has_feature(vq, VIRTIO_F_VERSION_1), 
val);
+   return __cpu_to_virtio16(vhost_is_little_endian(vq), val);
 }
 
 static inline u32 vhost32_to_cpu(struct vhost_virtqueue *vq, __virtio32 val)
 {
-   return __virtio32_to_cpu(vhost_has_feature(vq, VIRTIO_F_VERSION_1), 
val);
+   return __virtio32_to_cpu(vhost_is_little_endian(vq), val);
 }
 
 static inline __virtio32 cpu_to_vhost32(struct vhost_virtqueue *vq, u32 val)
 {
-   return __cpu_to_virtio32(vhost_has_feature(vq, VIRTIO_F_VERSION_1), 
val);
+   return __cpu_to_virtio32(vhost_is_little_endian(vq), val);
 }
 
 static inline u64 vhost64_to_cpu(struct vhost_virtqueue *vq, __virtio64 val)
 {
-   return __virtio64_to_cpu(vhost_has_feature(vq, VIRTIO_F_VERSION_1), 
val);
+   return __virtio64_to_cpu(vhost_is_little_endian(vq), val);
 }
 
 static inline __virtio64 cpu_to_vhost64(struct vhost_virtqueue *vq, u64 val)
 {
-   return __cpu_to_virtio64(vhost_has_feature(vq, VIRTIO_F_VERSION_1), 
val);
+   return __cpu_to_virtio64(vhost_is_little_endian(vq), val);
 }
 #endif



Re: [v6] kvm/fpu: Enable fully eager restore kvm FPU

2015-04-23 Thread Dave Hansen
On 04/23/2015 02:13 PM, Liang Li wrote:
 When compiling kernel on westmere, the performance of eager FPU
 is about 0.4% faster than lazy FPU.

Do you have a theory why this is?  What does the regression come from?




[PATCH v5 1/8] virtio: introduce virtio_is_little_endian() helper

2015-04-23 Thread Greg Kurz
Signed-off-by: Greg Kurz gk...@linux.vnet.ibm.com
---
 include/linux/virtio_config.h |   17 +++--
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index ca3ed78..bd1a582 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -205,35 +205,40 @@ int virtqueue_set_affinity(struct virtqueue *vq, int cpu)
return 0;
 }
 
+static inline bool virtio_is_little_endian(struct virtio_device *vdev)
+{
+   return virtio_has_feature(vdev, VIRTIO_F_VERSION_1);
+}
+
 /* Memory accessors */
 static inline u16 virtio16_to_cpu(struct virtio_device *vdev, __virtio16 val)
 {
-   return __virtio16_to_cpu(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), 
val);
+   return __virtio16_to_cpu(virtio_is_little_endian(vdev), val);
 }
 
 static inline __virtio16 cpu_to_virtio16(struct virtio_device *vdev, u16 val)
 {
-   return __cpu_to_virtio16(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), 
val);
+   return __cpu_to_virtio16(virtio_is_little_endian(vdev), val);
 }
 
 static inline u32 virtio32_to_cpu(struct virtio_device *vdev, __virtio32 val)
 {
-   return __virtio32_to_cpu(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), 
val);
+   return __virtio32_to_cpu(virtio_is_little_endian(vdev), val);
 }
 
 static inline __virtio32 cpu_to_virtio32(struct virtio_device *vdev, u32 val)
 {
-   return __cpu_to_virtio32(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), 
val);
+   return __cpu_to_virtio32(virtio_is_little_endian(vdev), val);
 }
 
 static inline u64 virtio64_to_cpu(struct virtio_device *vdev, __virtio64 val)
 {
-   return __virtio64_to_cpu(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), 
val);
+   return __virtio64_to_cpu(virtio_is_little_endian(vdev), val);
 }
 
 static inline __virtio64 cpu_to_virtio64(struct virtio_device *vdev, u64 val)
 {
-   return __cpu_to_virtio64(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), 
val);
+   return __cpu_to_virtio64(virtio_is_little_endian(vdev), val);
 }
 
 /* Config space accessors. */



[PATCH v5 0/8] vhost: support for cross endian guests

2015-04-23 Thread Greg Kurz
Hi,

This patchset allows vhost to be used with legacy virtio when guest and host
have a different endianness. It is compatible with modern virtio and can be
fully compiled out through kernel config.

FWIW, I could flawlessly kexec/reboot guests from ppc64 to ppc64le and back.
I could also migrate from a ppc64 to a ppc64le host and back. No regressions
on x86 as expected. My experimental QEMU tree is here:

https://github.com/gkurz/qemu.git vhost/cross-endian

I'd be glad if this series could make it to 4.1.

Cheers.

---

Greg Kurz (8):
  virtio: introduce virtio_is_little_endian() helper
  tun: add tun_is_little_endian() helper
  macvtap: introduce macvtap_is_little_endian() helper
  vringh: introduce vringh_is_little_endian() helper
  vhost: introduce vhost_is_little_endian() helper
  virtio: add explicit big-endian support to memory accessors
  vhost: cross-endian support for legacy devices
  macvtap/tun: cross-endian support for little-endian hosts


 drivers/net/Kconfig  |   14 ++
 drivers/net/macvtap.c|   68 +-
 drivers/net/tun.c|   70 ++-
 drivers/vhost/Kconfig|   15 +++
 drivers/vhost/vhost.c|   86 ++
 drivers/vhost/vhost.h|   25 ---
 include/linux/virtio_byteorder.h |   24 ++-
 include/linux/virtio_config.h|   20 ++---
 include/linux/vringh.h   |   17 +---
 include/uapi/linux/if_tun.h  |6 +++
 include/uapi/linux/vhost.h   |   12 +
 11 files changed, 324 insertions(+), 33 deletions(-)

--
Greg



[PATCH] kvm,x86: load guest FPU context more eagerly

2015-04-23 Thread Rik van Riel
Currently KVM will clear the FPU bits in CR0.TS in the VMCS, and trap to 
re-load them every time the guest accesses the FPU after a switch back into
the guest from the host.

This patch copies the x86 task switch semantics for FPU loading, with the
FPU loaded eagerly after first use if the system uses eager fpu mode,
or if the guest uses the FPU frequently.

In the latter case, after loading the FPU for 255 times, the fpu_counter
will roll over, and we will revert to loading the FPU on demand, until
it has been established that the guest is still actively using the FPU.

This mirrors the x86 task switch policy, which seems to work.

Signed-off-by: Rik van Riel r...@redhat.com
---
I still hope to put the larger FPU changes in at some point, but with
all the current changes to the FPU code I am somewhat uncomfortable
causing even more churn. After 4.1 I may send in the changes to defer
loading of user space FPU context to do_notify_resume() - unless people
want them sooner.

 arch/x86/kvm/x86.c   | 15 +--
 include/linux/kvm_host.h |  1 +
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e1a81267f3f6..2cdb2472a633 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7031,14 +7031,25 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
kvm_put_guest_xcr0(vcpu);
 
-	if (!vcpu->guest_fpu_loaded)
+	if (!vcpu->guest_fpu_loaded) {
+		vcpu->fpu_counter = 0;
 		return;
+	}
 
 	vcpu->guest_fpu_loaded = 0;
 	fpu_save_init(&vcpu->arch.guest_fpu);
 	__kernel_fpu_end();
 	++vcpu->stat.fpu_reload;
-   kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
+   /*
+* If using eager FPU mode, or if the guest is a frequent user
+* of the FPU, just leave the FPU active for next time.
+* Every 255 times fpu_counter rolls over to 0; a guest that uses
+* the FPU in bursts will revert to loading it on demand.
+*/
+   if (!use_eager_fpu()) {
+	if (++vcpu->fpu_counter < 5)
+   kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
+   }
trace_kvm_fpu(0);
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ad45054309a0..f197ad3f6316 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -230,6 +230,7 @@ struct kvm_vcpu {
 
int fpu_active;
int guest_fpu_loaded, guest_xcr0_loaded;
+   unsigned char fpu_counter;
wait_queue_head_t wq;
struct pid *pid;
int sigset_active;



[PATCH v5 6/8] virtio: add explicit big-endian support to memory accessors

2015-04-23 Thread Greg Kurz
The current memory accessors logic is:
- little endian if little_endian
- native endian (i.e. no byteswap) if !little_endian

If we want to fully support cross-endian vhost, we also need to be
able to convert to big endian.

Instead of changing the little_endian argument to some 3-value enum, this
patch changes the logic to:
- little endian if little_endian
- big endian if !little_endian

The native endian case is handled by all users with a trivial helper. This
patch doesn't change any functionality, nor does it add overhead.
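
For illustration (not part of the patch): a legacy, native-endian user
simply passes the new helper's result as the little_endian argument,
which keeps the old no-byteswap behaviour on the build's own
endianness (legacy_virtio16_to_cpu is just an example name):

	/* true on little-endian builds, false on big-endian builds */
	static inline u16 legacy_virtio16_to_cpu(__virtio16 val)
	{
		return __virtio16_to_cpu(virtio_legacy_is_little_endian(), val);
	}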

Signed-off-by: Greg Kurz gk...@linux.vnet.ibm.com
---

Changes since v4:
- style fixes (I have chosen if ... else in most places to stay below
  80 columns, with the notable exception of the vhost helper which gets
  shorten in a later patch)

 drivers/net/macvtap.c|5 -
 drivers/net/tun.c|5 -
 drivers/vhost/vhost.h|2 +-
 include/linux/virtio_byteorder.h |   24 ++--
 include/linux/virtio_config.h|5 -
 include/linux/vringh.h   |2 +-
 6 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index a2f2958..6cf6b3e 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -51,7 +51,10 @@ struct macvtap_queue {
 
 static inline bool macvtap_is_little_endian(struct macvtap_queue *q)
 {
-	return q->flags & MACVTAP_VNET_LE;
+	if (q->flags & MACVTAP_VNET_LE)
+   return true;
+   else
+   return virtio_legacy_is_little_endian();
 }
 
 static inline u16 macvtap16_to_cpu(struct macvtap_queue *q, __virtio16 val)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 3c3d6c0..5b044d4 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -208,7 +208,10 @@ struct tun_struct {
 
 static inline bool tun_is_little_endian(struct tun_struct *tun)
 {
-	return tun->flags & TUN_VNET_LE;
+	if (tun->flags & TUN_VNET_LE)
+   return true;
+   else
+   return virtio_legacy_is_little_endian();
 }
 
 static inline u16 tun16_to_cpu(struct tun_struct *tun, __virtio16 val)
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 6a49960..954c657 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -175,7 +175,7 @@ static inline bool vhost_has_feature(struct vhost_virtqueue 
*vq, int bit)
 
 static inline bool vhost_is_little_endian(struct vhost_virtqueue *vq)
 {
-   return vhost_has_feature(vq, VIRTIO_F_VERSION_1);
+   return vhost_has_feature(vq, VIRTIO_F_VERSION_1) ? true : 
virtio_legacy_is_little_endian();
 }
 
 /* Memory accessors */
diff --git a/include/linux/virtio_byteorder.h b/include/linux/virtio_byteorder.h
index 51865d0..ce63a2c 100644
--- a/include/linux/virtio_byteorder.h
+++ b/include/linux/virtio_byteorder.h
@@ -3,17 +3,21 @@
 #include <linux/types.h>
 #include <uapi/linux/virtio_types.h>
 
-/*
- * Low-level memory accessors for handling virtio in modern little endian and 
in
- * compatibility native endian format.
- */
+static inline bool virtio_legacy_is_little_endian(void)
+{
+#ifdef __LITTLE_ENDIAN
+   return true;
+#else
+   return false;
+#endif
+}
 
 static inline u16 __virtio16_to_cpu(bool little_endian, __virtio16 val)
 {
if (little_endian)
return le16_to_cpu((__force __le16)val);
else
-   return (__force u16)val;
+   return be16_to_cpu((__force __be16)val);
 }
 
 static inline __virtio16 __cpu_to_virtio16(bool little_endian, u16 val)
@@ -21,7 +25,7 @@ static inline __virtio16 __cpu_to_virtio16(bool 
little_endian, u16 val)
if (little_endian)
return (__force __virtio16)cpu_to_le16(val);
else
-   return (__force __virtio16)val;
+   return (__force __virtio16)cpu_to_be16(val);
 }
 
 static inline u32 __virtio32_to_cpu(bool little_endian, __virtio32 val)
@@ -29,7 +33,7 @@ static inline u32 __virtio32_to_cpu(bool little_endian, 
__virtio32 val)
if (little_endian)
return le32_to_cpu((__force __le32)val);
else
-   return (__force u32)val;
+   return be32_to_cpu((__force __be32)val);
 }
 
 static inline __virtio32 __cpu_to_virtio32(bool little_endian, u32 val)
@@ -37,7 +41,7 @@ static inline __virtio32 __cpu_to_virtio32(bool 
little_endian, u32 val)
if (little_endian)
return (__force __virtio32)cpu_to_le32(val);
else
-   return (__force __virtio32)val;
+   return (__force __virtio32)cpu_to_be32(val);
 }
 
 static inline u64 __virtio64_to_cpu(bool little_endian, __virtio64 val)
@@ -45,7 +49,7 @@ static inline u64 __virtio64_to_cpu(bool little_endian, 
__virtio64 val)
if (little_endian)
return le64_to_cpu((__force __le64)val);
else
-   return (__force u64)val;
+   return be64_to_cpu((__force __be64)val);
 }
 
 static inline __virtio64 

[PATCH v5 8/8] macvtap/tun: cross-endian support for little-endian hosts

2015-04-23 Thread Greg Kurz
The VNET_LE flag was introduced to fix accesses to virtio 1.0 headers
that are always little-endian. It can also be used to handle the special
case of a legacy little-endian device implemented by a big-endian host.

Let's add a flag and ioctls for big-endian devices as well. If both flags
are set, little-endian wins.

Since this isn't a common use case, the feature is controlled by a kernel
config option (not set by default).

Both macvtap and tun are covered by this patch since they share the same
API with userland.
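
For illustration only, the userland side would look something like this
(example code, not part of the patch; assumes a kernel built with
CONFIG_TUN_VNET_CROSS_LE, otherwise the ioctl fails with EINVAL as
described above):

	#include <sys/ioctl.h>
	#include <linux/if_tun.h>

	/* be = 1: vnet headers on this tap fd are big-endian,
	 * be = 0: revert to the default behaviour */
	static int tap_set_vnet_be(int tap_fd, int be)
	{
		return ioctl(tap_fd, TUNSETVNETBE, &be);
	}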

Signed-off-by: Greg Kurz gk...@linux.vnet.ibm.com
---

Changes since v4:
- rewrote patch title to mention cross-endian
- renamed config to TUN_VNET_CROSS_LE
- rewrote config description and help
- moved ifdefery to top of tun.c and macvtap.c
- updated comment in uapi/linux/if_tun.h to mention that the availability
  of both SET and GET ioctls depends on the kernel config

 drivers/net/Kconfig |   14 ++
 drivers/net/macvtap.c   |   58 +-
 drivers/net/tun.c   |   60 ++-
 include/uapi/linux/if_tun.h |6 
 4 files changed, 136 insertions(+), 2 deletions(-)

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index df51d60..71ac0ec 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -244,6 +244,20 @@ config TUN
 
  If you don't know what to use this for, you don't need it.
 
+config TUN_VNET_CROSS_LE
+	bool "Support for cross-endian vnet headers on little-endian kernels"
+   default n
+   ---help---
+ This option allows TUN/TAP and MACVTAP device drivers in a
+ little-endian kernel to parse vnet headers that come from a
+ big-endian legacy virtio device.
+
+ Userspace programs can control the feature using the TUNSETVNETBE
+ and TUNGETVNETBE ioctls.
+
+ Unless you have a little-endian system hosting a big-endian virtual
+ machine with a legacy virtio NIC, you should say N.
+
 config VETH
	tristate "Virtual ethernet pair device"
---help---
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 6cf6b3e..460ed9f 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -48,13 +48,63 @@ struct macvtap_queue {
 #define MACVTAP_FEATURES (IFF_VNET_HDR | IFF_MULTI_QUEUE)
 
 #define MACVTAP_VNET_LE 0x8000
+#define MACVTAP_VNET_BE 0x4000
+
+#ifdef CONFIG_TUN_VNET_CROSS_LE
+static inline bool macvtap_legacy_is_little_endian(struct macvtap_queue *q)
+{
+	if (q->flags & MACVTAP_VNET_BE)
+   return false;
+   return virtio_legacy_is_little_endian();
+}
+
+static long macvtap_get_vnet_be(struct macvtap_queue *q, int __user *sp)
+{
+	int s = !!(q->flags & MACVTAP_VNET_BE);
+
+   if (put_user(s, sp))
+   return -EFAULT;
+
+   return 0;
+}
+
+static long macvtap_set_vnet_be(struct macvtap_queue *q, int __user *sp)
+{
+   int s;
+
+   if (get_user(s, sp))
+   return -EFAULT;
+
+   if (s)
+		q->flags |= MACVTAP_VNET_BE;
+	else
+		q->flags &= ~MACVTAP_VNET_BE;
+
+   return 0;
+}
+#else
+static inline bool macvtap_legacy_is_little_endian(struct macvtap_queue *q)
+{
+   return virtio_legacy_is_little_endian();
+}
+
+static long macvtap_get_vnet_be(struct macvtap_queue *q, int __user *argp)
+{
+   return -EINVAL;
+}
+
+static long macvtap_set_vnet_be(struct macvtap_queue *q, int __user *argp)
+{
+   return -EINVAL;
+}
+#endif /* CONFIG_TUN_VNET_CROSS_LE */
 
 static inline bool macvtap_is_little_endian(struct macvtap_queue *q)
 {
	if (q->flags & MACVTAP_VNET_LE)
return true;
else
-   return virtio_legacy_is_little_endian();
+   return macvtap_legacy_is_little_endian(q);
 }
 
 static inline u16 macvtap16_to_cpu(struct macvtap_queue *q, __virtio16 val)
@@ -1098,6 +1148,12 @@ static long macvtap_ioctl(struct file *file, unsigned 
int cmd,
		q->flags &= ~MACVTAP_VNET_LE;
return 0;
 
+   case TUNGETVNETBE:
+   return macvtap_get_vnet_be(q, sp);
+
+   case TUNSETVNETBE:
+   return macvtap_set_vnet_be(q, sp);
+
case TUNSETOFFLOAD:
/* let the user check for future flags */
		if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 5b044d4..1b0afa9 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -111,6 +111,7 @@ do {
\
 #define TUN_FASYNC IFF_ATTACH_QUEUE
 /* High bits in flags field are unused. */
 #define TUN_VNET_LE 0x8000
+#define TUN_VNET_BE 0x4000
 
 #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
  IFF_MULTI_QUEUE)
@@ -206,12 +207,61 @@ struct tun_struct {
u32 flow_count;
 };
 
+#ifdef CONFIG_TUN_VNET_CROSS_LE
+static inline 

[PATCH v5 7/8] vhost: cross-endian support for legacy devices

2015-04-23 Thread Greg Kurz
This patch brings cross-endian support to vhost when used to implement
legacy virtio devices. Since it is a relatively rare situation, the
feature availability is controlled by a kernel config option (not set
by default).

The vq->is_le boolean field is added to cache the endianness to be
used for ring accesses. It defaults to native endian, as expected
by legacy virtio devices. When the ring gets active, we force little
endian if the device is modern. When the ring is deactivated, we
revert to the native endian default.

If cross-endian was compiled in, a vq->user_be boolean field is added
so that userspace may request a specific endianness. This field is
used to override the default when activating the ring of a legacy
device. It has no effect on modern devices.
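
For illustration only, the expected userspace usage looks roughly like
this (example code, not part of the patch; the ioctl must be issued
before the ring is started, otherwise it fails with EBUSY):

	#include <sys/ioctl.h>
	#include <linux/vhost.h>

	static int vhost_set_ring_big_endian(int vhost_fd, unsigned int index)
	{
		struct vhost_vring_state s = {
			.index = index,
			.num   = VHOST_VRING_BIG_ENDIAN,
		};

		return ioctl(vhost_fd, VHOST_SET_VRING_ENDIAN, &s);
	}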

Signed-off-by: Greg Kurz gk...@linux.vnet.ibm.com
---

Changes since v4:
- rewrote patch title to mention cross-endian
- renamed config to VHOST_CROSS_ENDIAN_LEGACY
- rewrote config description and help
- moved ifdefery to top of vhost.c
- added a detailed comment about the lifecycle of vq-user_be in
  vhost_init_is_le()
- renamed ioctls to VHOST_[GS]ET_VRING_ENDIAN
- added LE/BE defines to the ioctl API
- rewrote ioctl sanity check with the LE/BE defines
- updated comment in uapi/linux/vhost.h to mention that the availability
  of both SET and GET ioctls depends on the kernel config

 drivers/vhost/Kconfig  |   15 
 drivers/vhost/vhost.c  |   86 +++-
 drivers/vhost/vhost.h  |   10 +
 include/uapi/linux/vhost.h |   12 ++
 4 files changed, 121 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 017a1e8..74d7380 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -32,3 +32,18 @@ config VHOST
---help---
  This option is selected by any driver which needs to access
  the core of vhost.
+
+config VHOST_CROSS_ENDIAN_LEGACY
+	bool "Cross-endian support for vhost"
+   default n
+   ---help---
+ This option allows vhost to support guests with a different byte
+ ordering from host.
+
+ Userspace programs can control the feature using the
+ VHOST_SET_VRING_ENDIAN and VHOST_GET_VRING_ENDIAN ioctls.
+
+ This is only useful on a few platforms (ppc64 and arm64). Since it
+	  adds some overhead, it is disabled by default.
+
+ If unsure, say N.
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 2ee2826..8c4390d 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -36,6 +36,78 @@ enum {
 #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
 #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
 
+#ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
+static void vhost_vq_reset_user_be(struct vhost_virtqueue *vq)
+{
+	vq->user_be = !virtio_legacy_is_little_endian();
+}
+
+static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user 
*argp)
+{
+   struct vhost_vring_state s;
+
+	if (vq->private_data)
+		return -EBUSY;
+
+	if (copy_from_user(&s, argp, sizeof(s)))
+		return -EFAULT;
+
+	if (s.num != VHOST_VRING_LITTLE_ENDIAN &&
+	    s.num != VHOST_VRING_BIG_ENDIAN)
+		return -EINVAL;
+
+	vq->user_be = s.num;
+
+   return 0;
+}
+
+static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx,
+  int __user *argp)
+{
+   struct vhost_vring_state s = {
+   .index = idx,
+		.num = vq->user_be
+   };
+
+	if (copy_to_user(argp, &s, sizeof(s)))
+   return -EFAULT;
+
+   return 0;
+}
+
+static void vhost_init_is_le(struct vhost_virtqueue *vq)
+{
+   /* Note for legacy virtio: user_be is initialized at reset time
+* according to the host endianness. If userspace does not set an
+* explicit endianness, the default behavior is native endian, as
+* expected by legacy virtio.
+*/
+	vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1) || !vq->user_be;
+}
+#else
+static void vhost_vq_reset_user_be(struct vhost_virtqueue *vq)
+{
+   ;
+}
+
+static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user 
*argp)
+{
+   return -ENOIOCTLCMD;
+}
+
+static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx,
+  int __user *argp)
+{
+   return -ENOIOCTLCMD;
+}
+
+static void vhost_init_is_le(struct vhost_virtqueue *vq)
+{
+   if (vhost_has_feature(vq, VIRTIO_F_VERSION_1))
+		vq->is_le = true;
+}
+#endif /* CONFIG_VHOST_CROSS_ENDIAN_LEGACY */
+
 static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
poll_table *pt)
 {
@@ -199,6 +271,8 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->call = NULL;
 	vq->log_ctx = NULL;
 	vq->memory = NULL;
+	vq->is_le = 

Re: [PATCH v5 1/8] virtio: introduce virtio_is_little_endian() helper

2015-04-23 Thread Thomas Huth
Am Thu, 23 Apr 2015 17:26:20 +0200
schrieb Greg Kurz gk...@linux.vnet.ibm.com:

 Signed-off-by: Greg Kurz gk...@linux.vnet.ibm.com
 ---
  include/linux/virtio_config.h |   17 +++--
  1 file changed, 11 insertions(+), 6 deletions(-)
 
 diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
 index ca3ed78..bd1a582 100644
 --- a/include/linux/virtio_config.h
 +++ b/include/linux/virtio_config.h
 @@ -205,35 +205,40 @@ int virtqueue_set_affinity(struct virtqueue *vq, int 
 cpu)
   return 0;
  }
  
 +static inline bool virtio_is_little_endian(struct virtio_device *vdev)
 +{
 + return virtio_has_feature(vdev, VIRTIO_F_VERSION_1);
 +}

So this function returns false when _not_ using version 1, but running
on a little endian host + guest? Sounds confusing. Maybe you could name
it virtio_is_v1() or so instead?

 Thomas


Re: [GIT PULL] First batch of KVM changes for 4.1

2015-04-23 Thread Marcelo Tosatti
On Thu, Apr 23, 2015 at 02:02:29PM +0200, Paolo Bonzini wrote:
 
 
 On 23/04/2015 13:51, Marcelo Tosatti wrote:
https://bugzilla.redhat.com/show_bug.cgi?id=1174664
   
   That was the missing volatile in an asm.  Older compilers didn't catch
   it. :(
  How do you know that? It looks like memory corruption (look at the
  pattern at the end).
 
 I suspect some kind of operator error there, it makes no sense.


if (unlikely(s->flags & SLAB_POISON))
	memset(start, POISON_INUSE, PAGE_SIZE << order);

 *  Padding is done using 0x5a (POISON_INUSE)

 On the other hand, bug 1178975 is much clearer and the symptoms are the
 same.  In that bug, you can see that the same kernel source works on f20
 (package version 3.17.7-200.fc20.x86_64) and fails on f21 (package
 version 3.17.7-300.fc21.x86_64).  Of course the compiler is different.
 The newer one hoists the lsl out of the loop; if you get a CPU migration
 at the wrong time, the cpu != cpu1 condition will always be true and the
 loop will never exit.
 
 Paolo
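
(For reference, the pattern being described is roughly the following --
a simplified sketch, not the actual vDSO code, with made-up helper
names:

	do {
		cpu  = read_cpu_number();	/* the lsl-based getcpu   */
		vers = read_pvclock(cpu, &t);	/* per-cpu kvmclock data  */
		cpu1 = read_cpu_number();	/* must be re-read here   */
	} while (vers != pvclock_version(cpu) || cpu != cpu1);

If the compiler hoists one of the cpu reads out of the loop, a
migration at the wrong moment leaves cpu != cpu1 permanently true and
the loop never terminates.)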


Re: [PATCH v5 1/8] virtio: introduce virtio_is_little_endian() helper

2015-04-23 Thread Thomas Huth
Am Thu, 23 Apr 2015 19:22:15 +0200
schrieb Thomas Huth th...@redhat.com:

 Am Thu, 23 Apr 2015 17:26:20 +0200
 schrieb Greg Kurz gk...@linux.vnet.ibm.com:
 
  Signed-off-by: Greg Kurz gk...@linux.vnet.ibm.com
  ---
   include/linux/virtio_config.h |   17 +++--
   1 file changed, 11 insertions(+), 6 deletions(-)
  
  diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
  index ca3ed78..bd1a582 100644
  --- a/include/linux/virtio_config.h
  +++ b/include/linux/virtio_config.h
  @@ -205,35 +205,40 @@ int virtqueue_set_affinity(struct virtqueue *vq, int 
  cpu)
  return 0;
   }
   
  +static inline bool virtio_is_little_endian(struct virtio_device *vdev)
  +{
  +   return virtio_has_feature(vdev, VIRTIO_F_VERSION_1);
  +}
 
 So this function returns false when _not_ using version 1, but running
 on a little endian host + guest? Sounds confusing. Maybe you could name
 it virtio_is_v1() or so instead?

Ah, never mind, I should have looked at patch 6 first, then it makes
sense. (maybe you could put a note to the later patch in this patch
description?)

 Thomas


Re: [PATCH v5 1/8] virtio: introduce virtio_is_little_endian() helper

2015-04-23 Thread Thomas Huth
On Thu, 23 Apr 2015 17:26:20 +0200
Greg Kurz gk...@linux.vnet.ibm.com wrote:

 Signed-off-by: Greg Kurz gk...@linux.vnet.ibm.com
 ---
  include/linux/virtio_config.h |   17 +++--
  1 file changed, 11 insertions(+), 6 deletions(-)
 
 diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
 index ca3ed78..bd1a582 100644
 --- a/include/linux/virtio_config.h
 +++ b/include/linux/virtio_config.h
 @@ -205,35 +205,40 @@ int virtqueue_set_affinity(struct virtqueue *vq, int 
 cpu)
   return 0;
  }
  
 +static inline bool virtio_is_little_endian(struct virtio_device *vdev)
 +{
 + return virtio_has_feature(vdev, VIRTIO_F_VERSION_1);
 +}
 +
  /* Memory accessors */
  static inline u16 virtio16_to_cpu(struct virtio_device *vdev, __virtio16 val)
  {
 - return __virtio16_to_cpu(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), 
 val);
 + return __virtio16_to_cpu(virtio_is_little_endian(vdev), val);
  }
  
  static inline __virtio16 cpu_to_virtio16(struct virtio_device *vdev, u16 val)
  {
 - return __cpu_to_virtio16(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), 
 val);
 + return __cpu_to_virtio16(virtio_is_little_endian(vdev), val);
  }
  
  static inline u32 virtio32_to_cpu(struct virtio_device *vdev, __virtio32 val)
  {
 - return __virtio32_to_cpu(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), 
 val);
 + return __virtio32_to_cpu(virtio_is_little_endian(vdev), val);
  }
  
  static inline __virtio32 cpu_to_virtio32(struct virtio_device *vdev, u32 val)
  {
 - return __cpu_to_virtio32(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), 
 val);
 + return __cpu_to_virtio32(virtio_is_little_endian(vdev), val);
  }
  
  static inline u64 virtio64_to_cpu(struct virtio_device *vdev, __virtio64 val)
  {
 - return __virtio64_to_cpu(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), 
 val);
 + return __virtio64_to_cpu(virtio_is_little_endian(vdev), val);
  }
  
  static inline __virtio64 cpu_to_virtio64(struct virtio_device *vdev, u64 val)
  {
 - return __cpu_to_virtio64(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), 
 val);
 + return __cpu_to_virtio64(virtio_is_little_endian(vdev), val);
  }

Reviewed-by: Thomas Huth th...@redhat.com


Re: [PATCH v5 2/8] tun: add tun_is_little_endian() helper

2015-04-23 Thread Thomas Huth
On Thu, 23 Apr 2015 17:26:30 +0200
Greg Kurz gk...@linux.vnet.ibm.com wrote:

 Signed-off-by: Greg Kurz gk...@linux.vnet.ibm.com
 ---
  drivers/net/tun.c |9 +++--
  1 file changed, 7 insertions(+), 2 deletions(-)
 
 diff --git a/drivers/net/tun.c b/drivers/net/tun.c
 index 857dca4..3c3d6c0 100644
 --- a/drivers/net/tun.c
 +++ b/drivers/net/tun.c
 @@ -206,14 +206,19 @@ struct tun_struct {
   u32 flow_count;
  };
  
 +static inline bool tun_is_little_endian(struct tun_struct *tun)
 +{
 + return tun->flags & TUN_VNET_LE;
 +}
 +
  static inline u16 tun16_to_cpu(struct tun_struct *tun, __virtio16 val)
  {
 - return __virtio16_to_cpu(tun->flags & TUN_VNET_LE, val);
 + return __virtio16_to_cpu(tun_is_little_endian(tun), val);
  }
  
  static inline __virtio16 cpu_to_tun16(struct tun_struct *tun, u16 val)
  {
 - return __cpu_to_virtio16(tun->flags & TUN_VNET_LE, val);
 + return __cpu_to_virtio16(tun_is_little_endian(tun), val);
  }

Reviewed-by: Thomas Huth th...@redhat.com



[PATCH] KVM: arm64: add active register handling to GICv3 emulation as well

2015-04-23 Thread Andre Przywara
Commit 47a98b15ba7c ("arm/arm64: KVM: support for un-queuing active
IRQs") introduced handling of the GICD_I[SC]ACTIVER registers,
but only for the GICv2 emulation. For the sake of completeness and
as this is a pre-requisite for save/restore of the GICv3 distributor
state, we should also emulate their handling in the distributor and
redistributor frames of an emulated GICv3.

Signed-off-by: Andre Przywara andre.przyw...@arm.com
---
 virt/kvm/arm/vgic-v3-emul.c |   54 +++
 1 file changed, 50 insertions(+), 4 deletions(-)

diff --git a/virt/kvm/arm/vgic-v3-emul.c b/virt/kvm/arm/vgic-v3-emul.c
index e9c3a7a..2b369de 100644
--- a/virt/kvm/arm/vgic-v3-emul.c
+++ b/virt/kvm/arm/vgic-v3-emul.c
@@ -173,6 +173,32 @@ static bool handle_mmio_clear_pending_reg_dist(struct 
kvm_vcpu *vcpu,
return false;
 }
 
+static bool handle_mmio_set_active_reg_dist(struct kvm_vcpu *vcpu,
+   struct kvm_exit_mmio *mmio,
+   phys_addr_t offset)
+{
+	if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
+		return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset,
+						  vcpu->vcpu_id);
+
+   vgic_reg_access(mmio, NULL, offset,
+   ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
+   return false;
+}
+
+static bool handle_mmio_clear_active_reg_dist(struct kvm_vcpu *vcpu,
+ struct kvm_exit_mmio *mmio,
+ phys_addr_t offset)
+{
+	if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
+		return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset,
+						    vcpu->vcpu_id);
+
+   vgic_reg_access(mmio, NULL, offset,
+   ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
+   return false;
+}
+
 static bool handle_mmio_priority_reg_dist(struct kvm_vcpu *vcpu,
  struct kvm_exit_mmio *mmio,
  phys_addr_t offset)
@@ -428,13 +454,13 @@ static const struct vgic_io_range vgic_v3_dist_ranges[] = 
{
.base   = GICD_ISACTIVER,
.len= 0x80,
.bits_per_irq   = 1,
-   .handle_mmio= handle_mmio_raz_wi,
+   .handle_mmio= handle_mmio_set_active_reg_dist,
},
{
.base   = GICD_ICACTIVER,
.len= 0x80,
.bits_per_irq   = 1,
-   .handle_mmio= handle_mmio_raz_wi,
+   .handle_mmio= handle_mmio_clear_active_reg_dist,
},
{
.base   = GICD_IPRIORITYR,
@@ -561,6 +587,26 @@ static bool handle_mmio_clear_enable_reg_redist(struct 
kvm_vcpu *vcpu,
  ACCESS_WRITE_CLEARBIT);
 }
 
+static bool handle_mmio_set_active_reg_redist(struct kvm_vcpu *vcpu,
+ struct kvm_exit_mmio *mmio,
+ phys_addr_t offset)
+{
+	struct kvm_vcpu *redist_vcpu = mmio->private;
+
+	return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset,
+					  redist_vcpu->vcpu_id);
+}
+
+static bool handle_mmio_clear_active_reg_redist(struct kvm_vcpu *vcpu,
+   struct kvm_exit_mmio *mmio,
+   phys_addr_t offset)
+{
+	struct kvm_vcpu *redist_vcpu = mmio->private;
+
+	return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset,
+					    redist_vcpu->vcpu_id);
+}
+
 static bool handle_mmio_set_pending_reg_redist(struct kvm_vcpu *vcpu,
   struct kvm_exit_mmio *mmio,
   phys_addr_t offset)
@@ -674,13 +720,13 @@ static const struct vgic_io_range vgic_redist_ranges[] = {
.base   = SGI_base(GICR_ISACTIVER0),
.len= 0x04,
.bits_per_irq   = 1,
-   .handle_mmio= handle_mmio_raz_wi,
+   .handle_mmio= handle_mmio_set_active_reg_redist,
},
{
.base   = SGI_base(GICR_ICACTIVER0),
.len= 0x04,
.bits_per_irq   = 1,
-   .handle_mmio= handle_mmio_raz_wi,
+   .handle_mmio= handle_mmio_clear_active_reg_redist,
},
{
.base   = SGI_base(GICR_IPRIORITYR0),
-- 
1.7.9.5



Re: [v6] kvm/fpu: Enable fully eager restore kvm FPU

2015-04-23 Thread H. Peter Anvin
On 04/23/2015 08:28 AM, Dave Hansen wrote:
 On 04/23/2015 02:13 PM, Liang Li wrote:
 When compiling kernel on westmere, the performance of eager FPU
 is about 0.4% faster than lazy FPU.
 
 Do you have a theory why this is?  What does the regression come from?
 

This is interesting since previous measurements on KVM have had the
exact opposite results.  I think we need to understand this a lot more.

-hpa




Re: [PATCH v5 4/8] vringh: introduce vringh_is_little_endian() helper

2015-04-23 Thread Thomas Huth
On Thu, 23 Apr 2015 17:26:52 +0200
Greg Kurz gk...@linux.vnet.ibm.com wrote:

 Signed-off-by: Greg Kurz gk...@linux.vnet.ibm.com
 ---
  include/linux/vringh.h |   17 +++--
  1 file changed, 11 insertions(+), 6 deletions(-)
 
 diff --git a/include/linux/vringh.h b/include/linux/vringh.h
 index a3fa537..3ed62ef 100644
 --- a/include/linux/vringh.h
 +++ b/include/linux/vringh.h
 @@ -226,33 +226,38 @@ static inline void vringh_notify(struct vringh *vrh)
   vrh->notify(vrh);
  }
  
 +static inline bool vringh_is_little_endian(const struct vringh *vrh)
 +{
 + return vrh->little_endian;
 +}
 +
  static inline u16 vringh16_to_cpu(const struct vringh *vrh, __virtio16 val)
  {
 - return __virtio16_to_cpu(vrh->little_endian, val);
 + return __virtio16_to_cpu(vringh_is_little_endian(vrh), val);
  }
  
  static inline __virtio16 cpu_to_vringh16(const struct vringh *vrh, u16 val)
  {
 - return __cpu_to_virtio16(vrh->little_endian, val);
 + return __cpu_to_virtio16(vringh_is_little_endian(vrh), val);
  }
  
  static inline u32 vringh32_to_cpu(const struct vringh *vrh, __virtio32 val)
  {
 - return __virtio32_to_cpu(vrh->little_endian, val);
 + return __virtio32_to_cpu(vringh_is_little_endian(vrh), val);
  }
  
  static inline __virtio32 cpu_to_vringh32(const struct vringh *vrh, u32 val)
  {
 - return __cpu_to_virtio32(vrh->little_endian, val);
 + return __cpu_to_virtio32(vringh_is_little_endian(vrh), val);
  }
  
  static inline u64 vringh64_to_cpu(const struct vringh *vrh, __virtio64 val)
  {
 - return __virtio64_to_cpu(vrh->little_endian, val);
 + return __virtio64_to_cpu(vringh_is_little_endian(vrh), val);
  }
  
  static inline __virtio64 cpu_to_vringh64(const struct vringh *vrh, u64 val)
  {
 - return __cpu_to_virtio64(vrh->little_endian, val);
 + return __cpu_to_virtio64(vringh_is_little_endian(vrh), val);
  }
  #endif /* _LINUX_VRINGH_H */

Reviewed-by: Thomas Huth th...@redhat.com


Re: [PATCH v5 5/8] vhost: introduce vhost_is_little_endian() helper

2015-04-23 Thread Thomas Huth
On Thu, 23 Apr 2015 17:27:05 +0200
Greg Kurz gk...@linux.vnet.ibm.com wrote:

 Signed-off-by: Greg Kurz gk...@linux.vnet.ibm.com
 ---
  drivers/vhost/vhost.h |   17 +++--
  1 file changed, 11 insertions(+), 6 deletions(-)
 
 diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
 index 8c1c792..6a49960 100644
 --- a/drivers/vhost/vhost.h
 +++ b/drivers/vhost/vhost.h
 @@ -173,34 +173,39 @@ static inline bool vhost_has_feature(struct 
 vhost_virtqueue *vq, int bit)
   return vq->acked_features & (1ULL << bit);
  }
  
 +static inline bool vhost_is_little_endian(struct vhost_virtqueue *vq)
 +{
 + return vhost_has_feature(vq, VIRTIO_F_VERSION_1);
 +}
 +
  /* Memory accessors */
  static inline u16 vhost16_to_cpu(struct vhost_virtqueue *vq, __virtio16 val)
  {
 - return __virtio16_to_cpu(vhost_has_feature(vq, VIRTIO_F_VERSION_1), 
 val);
 + return __virtio16_to_cpu(vhost_is_little_endian(vq), val);
  }
  
  static inline __virtio16 cpu_to_vhost16(struct vhost_virtqueue *vq, u16 val)
  {
 - return __cpu_to_virtio16(vhost_has_feature(vq, VIRTIO_F_VERSION_1), 
 val);
 + return __cpu_to_virtio16(vhost_is_little_endian(vq), val);
  }
  
  static inline u32 vhost32_to_cpu(struct vhost_virtqueue *vq, __virtio32 val)
  {
 - return __virtio32_to_cpu(vhost_has_feature(vq, VIRTIO_F_VERSION_1), 
 val);
 + return __virtio32_to_cpu(vhost_is_little_endian(vq), val);
  }
  
  static inline __virtio32 cpu_to_vhost32(struct vhost_virtqueue *vq, u32 val)
  {
 - return __cpu_to_virtio32(vhost_has_feature(vq, VIRTIO_F_VERSION_1), 
 val);
 + return __cpu_to_virtio32(vhost_is_little_endian(vq), val);
  }
  
  static inline u64 vhost64_to_cpu(struct vhost_virtqueue *vq, __virtio64 val)
  {
 - return __virtio64_to_cpu(vhost_has_feature(vq, VIRTIO_F_VERSION_1), 
 val);
 + return __virtio64_to_cpu(vhost_is_little_endian(vq), val);
  }
  
  static inline __virtio64 cpu_to_vhost64(struct vhost_virtqueue *vq, u64 val)
  {
 - return __cpu_to_virtio64(vhost_has_feature(vq, VIRTIO_F_VERSION_1), 
 val);
 + return __cpu_to_virtio64(vhost_is_little_endian(vq), val);
  }
  #endif

Reviewed-by: Thomas Huth th...@redhat.com


[PATCH] kvmclock: set scheduler clock stable

2015-04-23 Thread Luiz Capitulino
If you try to enable NOHZ_FULL on a guest today, you'll get
the following error when the guest tries to deactivate the
scheduler tick:

 WARNING: CPU: 3 PID: 2182 at kernel/time/tick-sched.c:192 
can_stop_full_tick+0xb9/0x290()
 NO_HZ FULL will not work with unstable sched clock
 CPU: 3 PID: 2182 Comm: kworker/3:1 Not tainted 4.0.0-10545-gb9bb6fb #204
 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
 Workqueue: events flush_to_ldisc
  8162a0c7 88011f583e88 814e6ba0 0002
  88011f583ed8 88011f583ec8 8104d095 88011f583eb8
   0003 0001 0001
 Call Trace:
  IRQ  [814e6ba0] dump_stack+0x4f/0x7b
  [8104d095] warn_slowpath_common+0x85/0xc0
  [8104d146] warn_slowpath_fmt+0x46/0x50
  [810bd2a9] can_stop_full_tick+0xb9/0x290
  [810bd9ed] tick_nohz_irq_exit+0x8d/0xb0
  [810511c5] irq_exit+0xc5/0x130
  [814f180a] smp_apic_timer_interrupt+0x4a/0x60
  [814eff5e] apic_timer_interrupt+0x6e/0x80
  EOI  [814ee5d1] ? _raw_spin_unlock_irqrestore+0x31/0x60
  [8108bbc8] __wake_up+0x48/0x60
  [8134836c] n_tty_receive_buf_common+0x49c/0xba0
  [8134a6bf] ? tty_ldisc_ref+0x1f/0x70
  [81348a84] n_tty_receive_buf2+0x14/0x20
  [8134b390] flush_to_ldisc+0xe0/0x120
  [81064d05] process_one_work+0x1d5/0x540
  [81064c81] ? process_one_work+0x151/0x540
  [81065191] worker_thread+0x121/0x470
  [81065070] ? process_one_work+0x540/0x540
  [8106b4df] kthread+0xef/0x110
  [8106b3f0] ? __kthread_parkme+0xa0/0xa0
  [814ef4f2] ret_from_fork+0x42/0x70
  [8106b3f0] ? __kthread_parkme+0xa0/0xa0
 ---[ end trace 06e3507544a38866 ]---

However, it turns out that kvmclock does provide a stable
sched_clock callback. So, let the scheduler know this, which
in turn makes NOHZ_FULL work in the guest.

Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
Signed-off-by: Luiz Capitulino lcapitul...@redhat.com
---

PS: Original author of this patch is Marcelo. I did most of the
testing and backported it to an older real-time kernel tree. Works
like a charm.

 arch/x86/kernel/kvmclock.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 42caaef..4e03921 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -24,6 +24,7 @@
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
 #include <linux/memblock.h>
+#include <linux/sched.h>
 
 #include <asm/x86_init.h>
 #include <asm/reboot.h>
@@ -265,6 +266,8 @@ void __init kvmclock_init(void)
 
if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
+
+   set_sched_clock_stable();
 }
 
 int __init kvm_setup_vsyscall_timeinfo(void)
-- 
1.9.3



Re: [PATCH v5 3/8] macvtap: introduce macvtap_is_little_endian() helper

2015-04-23 Thread Thomas Huth
On Thu, 23 Apr 2015 17:26:41 +0200
Greg Kurz gk...@linux.vnet.ibm.com wrote:

 Signed-off-by: Greg Kurz gk...@linux.vnet.ibm.com
 ---
  drivers/net/macvtap.c |9 +++--
  1 file changed, 7 insertions(+), 2 deletions(-)
 
 diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
 index 27ecc5c..a2f2958 100644
 --- a/drivers/net/macvtap.c
 +++ b/drivers/net/macvtap.c
 @@ -49,14 +49,19 @@ struct macvtap_queue {
  
  #define MACVTAP_VNET_LE 0x8000
  
 +static inline bool macvtap_is_little_endian(struct macvtap_queue *q)
 +{
 + return q->flags & MACVTAP_VNET_LE;
 +}
 +
  static inline u16 macvtap16_to_cpu(struct macvtap_queue *q, __virtio16 val)
  {
 - return __virtio16_to_cpu(q->flags & MACVTAP_VNET_LE, val);
 + return __virtio16_to_cpu(macvtap_is_little_endian(q), val);
  }
  
  static inline __virtio16 cpu_to_macvtap16(struct macvtap_queue *q, u16 val)
  {
 - return __cpu_to_virtio16(q->flags & MACVTAP_VNET_LE, val);
 + return __cpu_to_virtio16(macvtap_is_little_endian(q), val);
  }

Reviewed-by: Thomas Huth th...@redhat.com


Re: [PATCH v5 6/8] virtio: add explicit big-endian support to memory accessors

2015-04-23 Thread Thomas Huth
On Thu, 23 Apr 2015 17:29:06 +0200
Greg Kurz gk...@linux.vnet.ibm.com wrote:

 The current memory accessors logic is:
 - little endian if little_endian
 - native endian (i.e. no byteswap) if !little_endian
 
 If we want to fully support cross-endian vhost, we also need to be
 able to convert to big endian.
 
 Instead of changing the little_endian argument to some 3-value enum, this
 patch changes the logic to:
 - little endian if little_endian
 - big endian if !little_endian
 
 The native endian case is handled by all users with a trivial helper. This
 patch doesn't change any functionality, nor does it add overhead.
 
 Signed-off-by: Greg Kurz gk...@linux.vnet.ibm.com
 ---
 
 Changes since v4:
 - style fixes (I have chosen if ... else in most places to stay below
   80 columns, with the notable exception of the vhost helper which gets
   shorten in a later patch)
 
  drivers/net/macvtap.c|5 -
  drivers/net/tun.c|5 -
  drivers/vhost/vhost.h|2 +-
  include/linux/virtio_byteorder.h |   24 ++--
  include/linux/virtio_config.h|5 -
  include/linux/vringh.h   |2 +-
  6 files changed, 28 insertions(+), 15 deletions(-)
 
 diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
 index a2f2958..6cf6b3e 100644
 --- a/drivers/net/macvtap.c
 +++ b/drivers/net/macvtap.c
 @@ -51,7 +51,10 @@ struct macvtap_queue {
  
  static inline bool macvtap_is_little_endian(struct macvtap_queue *q)
  {
 - return q->flags & MACVTAP_VNET_LE;
 + if (q->flags & MACVTAP_VNET_LE)
 + return true;
 + else
 + return virtio_legacy_is_little_endian();

simply:

return (q->flags & MACVTAP_VNET_LE) ||
   virtio_legacy_is_little_endian();

?
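
For what it's worth, the || form should generate the same code:
virtio_legacy_is_little_endian(), introduced earlier in the series,
presumably boils down to a compile-time constant, so the compiler can fold
it away. A sketch of what I assume the helper looks like (written from
memory, not copied from the patches):

static inline bool virtio_legacy_is_little_endian(void)
{
#ifdef __LITTLE_ENDIAN
	return true;	/* legacy (native-endian) device on an LE kernel */
#else
	return false;	/* legacy (native-endian) device on a BE kernel */
#endif
}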

  }
  
  static inline u16 macvtap16_to_cpu(struct macvtap_queue *q, __virtio16 val)
 diff --git a/drivers/net/tun.c b/drivers/net/tun.c
 index 3c3d6c0..5b044d4 100644
 --- a/drivers/net/tun.c
 +++ b/drivers/net/tun.c
 @@ -208,7 +208,10 @@ struct tun_struct {
  
  static inline bool tun_is_little_endian(struct tun_struct *tun)
  {
 - return tun->flags & TUN_VNET_LE;
 + if (tun->flags & TUN_VNET_LE)
 + return true;
 + else
 + return virtio_legacy_is_little_endian();

ditto?

  }
  
  static inline u16 tun16_to_cpu(struct tun_struct *tun, __virtio16 val)
 diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
 index 6a49960..954c657 100644
 --- a/drivers/vhost/vhost.h
 +++ b/drivers/vhost/vhost.h
 @@ -175,7 +175,7 @@ static inline bool vhost_has_feature(struct 
 vhost_virtqueue *vq, int bit)
  
  static inline bool vhost_is_little_endian(struct vhost_virtqueue *vq)
  {
 - return vhost_has_feature(vq, VIRTIO_F_VERSION_1);
 + return vhost_has_feature(vq, VIRTIO_F_VERSION_1) ? true : 
 virtio_legacy_is_little_endian();
  }

That line is way longer than 80 characters ... may I suggest to switch
at least here to:

return vhost_has_feature(vq, VIRTIO_F_VERSION_1) ||
   virtio_legacy_is_little_endian();

?

Apart from the cosmetics, the patch looks good to me.

 Thomas


RE: [v6] kvm/fpu: Enable fully eager restore kvm FPU

2015-04-23 Thread Zhang, Yang Z
H. Peter Anvin wrote on 2015-04-24:
 On 04/23/2015 08:28 AM, Dave Hansen wrote:
 On 04/23/2015 02:13 PM, Liang Li wrote:
 When compiling kernel on westmere, the performance of eager FPU is
 about 0.4% faster than lazy FPU.
 
 Do you have a theory why this is?  Where does the regression come from?
 
 
 This is interesting since previous measurements on KVM have had the
 exact opposite results.  I think we need to understand this a lot more.

What I can tell is that a vmexit is heavy, so it is reasonable to see an
improvement in some cases, especially since the kernel is using eager FPU now,
which means each schedule may trigger a vmexit.

 
   -hpa



Best regards,
Yang




Re: [PATCH] KVM: s390: remove delayed reallocation of page tables for KVM

2015-04-23 Thread Cole Robinson
On 04/23/2015 08:07 AM, Paolo Bonzini wrote:
 
 
 On 23/04/2015 13:43, Christian Borntraeger wrote:

 Couldn't you make this a hidden kconfig option that gets automatically 
 selected when kvm is enabled? Or is there a non-kvm case that needs it too?
 For things like RHEV the default could certainly be enabled, but for normal
 distros like SLES/RHEL, the idea was to NOT enable that by default, as the 
 non-KVM
 case is more common and might suffer from the additional memory consumption 
 of
 the page tables. (big databases come to mind)

 We could think about having rpms like kvm to provide a sysctl file that sets 
 it if we
 want to minimize the impact. Other ideas?
 
 I can say what _won't_ work which is tying it to the KVM module.
 Nowadays it is loaded automatically via udev on the first /dev/kvm
 access, and that's already too late because qemu-kvm's page tables have
 been created already.  Right?
 
 With my Fedora hat on, adding a sysctl file to the userspace RPMs (e.g.
 qemu) would work.  CCing Cole Robinson who is the main maintainer of the
 Fedora virt packages.
 

From a packaging POV that sounds fine to me

- Cole


Re: [GSoC] project proposal

2015-04-23 Thread Stefan Hajnoczi
On Wed, Apr 22, 2015 at 9:51 AM, Catalin Vasile
catalinvasil...@gmail.com wrote:
 On Wed, Apr 22, 2015 at 11:20 AM, Stefan Hajnoczi stefa...@gmail.com wrote:
 On Tue, Apr 21, 2015 at 04:07:56PM +0200, Paolo Bonzini wrote:
 On 21/04/2015 16:07, Catalin Vasile wrote:
  I don't get the part with getting cryptodev upstream.
  I don't know what getting cryptodev upstream actually implies.
  From what I know cryptodev is done (is a functional project) that was
  rejected in the Linux Kernel
  and there isn't actually a way to get it upstream.

 Yes, I agree.

 The limitations of AF_ALG need to be addressed somehow, so what is the next
 step?

 Stefan

 If we want a mainstream userspace backend that could interact with a
 lot of crypto engines, we could use OpenSSL (it can actually use
 cryptodev and AF_ALG as engines).
 For now, until mid June (my diploma project presentation) I still want
 to use vhost as a backend for the sole purpose of having a finished
 backend which now I have a good grasp upon.

I understand.

Once you have a first approximation of the new virtio crypto device
interface, I suggest continuing the discussion with the VIRTIO working
group:
https://www.oasis-open.org/committees/tc_home.php?wg_abbrev=virtio#feedback

If you send a virtio spec proposal you can get feedback.

Stefan


Re: [GIT PULL] First batch of KVM changes for 4.1

2015-04-23 Thread Paolo Bonzini


On 22/04/2015 23:21, Marcelo Tosatti wrote:
 On Mon, Apr 20, 2015 at 01:27:58PM -0700, Andy Lutomirski wrote:
 On Mon, Apr 20, 2015 at 9:59 AM, Paolo Bonzini pbonz...@redhat.com wrote:


 On 17/04/2015 22:18, Marcelo Tosatti wrote:
 The bug which this is fixing is very rare; I have no memory of a report.

 In fact, it's even difficult to create a synthetic reproducer.

 But then why was the task migration notifier even in Jeremy's original
 code for Xen?  Was it supposed to work even on non-synchronized TSC?

 If that's the case, then it could be reverted indeed; but then why did
 you commit this patch to 4.1?  Did you think of something that would
 cause the seqcount-like protocol to fail, and that turned out not to be
 the case later?  I was only following the mailing list sparsely in March.

 I don't think anyone ever tried that hard to test this stuff.  There
 was an infinite loop that Firefox was triggering as a KVM guest
 somewhat reliably until a couple months ago in the same vdso code.  :(
 
 https://bugzilla.redhat.com/show_bug.cgi?id=1174664

That was the missing volatile in an asm.  Older compilers didn't catch
it. :(

Paolo


[v6] kvm/fpu: Enable fully eager restore kvm FPU

2015-04-23 Thread Liang Li
Remove lazy FPU logic and use eager FPU entirely. Eager FPU does
not have performance regression, and it can simplify the code.

When compiling kernel on westmere, the performance of eager FPU
is about 0.4% faster than lazy FPU.

Signed-off-by: Liang Li liang.z...@intel.com
Signed-off-by: Xudong Hao xudong@intel.com
---
 arch/x86/include/asm/kvm_host.h |  1 -
 arch/x86/kvm/svm.c  | 22 ++--
 arch/x86/kvm/vmx.c  | 74 +++--
 arch/x86/kvm/x86.c  |  8 +
 include/linux/kvm_host.h|  2 --
 5 files changed, 9 insertions(+), 98 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index dea2e7e..5d84cc9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -743,7 +743,6 @@ struct kvm_x86_ops {
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
-   void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
 
void (*tlb_flush)(struct kvm_vcpu *vcpu);
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ce741b8..1b3b29b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1087,7 +1087,6 @@ static void init_vmcb(struct vcpu_svm *svm)
struct vmcb_control_area *control = svm-vmcb-control;
struct vmcb_save_area *save = svm-vmcb-save;
 
-   svm-vcpu.fpu_active = 1;
svm-vcpu.arch.hflags = 0;
 
set_cr_intercept(svm, INTERCEPT_CR0_READ);
@@ -1529,15 +1528,12 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
ulong gcr0 = svm-vcpu.arch.cr0;
u64 *hcr0 = svm-vmcb-save.cr0;
 
-   if (!svm-vcpu.fpu_active)
-   *hcr0 |= SVM_CR0_SELECTIVE_MASK;
-   else
-   *hcr0 = (*hcr0  ~SVM_CR0_SELECTIVE_MASK)
-   | (gcr0  SVM_CR0_SELECTIVE_MASK);
+   *hcr0 = (*hcr0  ~SVM_CR0_SELECTIVE_MASK)
+   | (gcr0  SVM_CR0_SELECTIVE_MASK);
 
mark_dirty(svm-vmcb, VMCB_CR);
 
-   if (gcr0 == *hcr0  svm-vcpu.fpu_active) {
+   if (gcr0 == *hcr0) {
clr_cr_intercept(svm, INTERCEPT_CR0_READ);
clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
} else {
@@ -1568,8 +1564,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned 
long cr0)
if (!npt_enabled)
cr0 |= X86_CR0_PG | X86_CR0_WP;
 
-   if (!vcpu-fpu_active)
-   cr0 |= X86_CR0_TS;
/*
 * re-enable caching here because the QEMU bios
 * does not do it - this results in some delay at
@@ -1795,7 +1789,6 @@ static void svm_fpu_activate(struct kvm_vcpu *vcpu)
 
clr_exception_intercept(svm, NM_VECTOR);
 
-   svm-vcpu.fpu_active = 1;
update_cr0_intercept(svm);
 }
 
@@ -4139,14 +4132,6 @@ static bool svm_has_wbinvd_exit(void)
return true;
 }
 
-static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
-{
-   struct vcpu_svm *svm = to_svm(vcpu);
-
-   set_exception_intercept(svm, NM_VECTOR);
-   update_cr0_intercept(svm);
-}
-
 #define PRE_EX(exit)  { .exit_code = (exit), \
.stage = X86_ICPT_PRE_EXCEPT, }
 #define POST_EX(exit) { .exit_code = (exit), \
@@ -4381,7 +4366,6 @@ static struct kvm_x86_ops svm_x86_ops = {
.cache_reg = svm_cache_reg,
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
-   .fpu_deactivate = svm_fpu_deactivate,
 
.tlb_flush = svm_flush_tlb,
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f5e8dce..811a666 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1567,7 +1567,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
u32 eb;
 
eb = (1u  PF_VECTOR) | (1u  UD_VECTOR) | (1u  MC_VECTOR) |
-(1u  NM_VECTOR) | (1u  DB_VECTOR);
+(1u  DB_VECTOR);
if ((vcpu-guest_debug 
 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
@@ -1576,8 +1576,6 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
eb = ~0;
if (enable_ept)
eb = ~(1u  PF_VECTOR); /* bypass_guest_pf = 0 */
-   if (vcpu-fpu_active)
-   eb = ~(1u  NM_VECTOR);
 
/* When we are running a nested L2 guest and L1 specified for it a
 * certain exception bitmap, we must trap the same exceptions and pass
@@ -1961,9 +1959,6 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
 {
ulong cr0;
 
-   if (vcpu-fpu_active)
-   return;
-   vcpu-fpu_active = 1;
cr0 = vmcs_readl(GUEST_CR0);
cr0 = ~(X86_CR0_TS | X86_CR0_MP);
cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
@@ -1994,33 +1989,6 @@ static inline unsigned long nested_read_cr4(struct 
vmcs12 *fields)
(fields-cr4_read_shadow  

[PATCH] ARM: KVM: Remove pointless void pointer cast

2015-04-23 Thread Firo Yang
No need to cast the void pointer returned by kmalloc() in
arch/arm/kvm/mmu.c::kvm_alloc_stage2_pgd().

Signed-off-by: Firo Yang fir...@gmail.com
---
 arch/arm/kvm/mmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 1d5accb..ce0bce4 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -691,8 +691,8 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
 * work.  This is not used by the hardware and we have no
 * alignment requirement for this allocation.
 */
-   pgd = (pgd_t *)kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t),
-  GFP_KERNEL | __GFP_ZERO);
+   pgd = kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t),
+   GFP_KERNEL | __GFP_ZERO);
 
if (!pgd) {
kvm_free_hwpgd(hwpgd);
-- 
2.1.0



Re: [v6] kvm/fpu: Enable fully eager restore kvm FPU

2015-04-23 Thread Paolo Bonzini


On 23/04/2015 23:13, Liang Li wrote:
 Remove lazy FPU logic and use eager FPU entirely. Eager FPU does
 not have performance regression, and it can simplify the code.
 
 When compiling kernel on westmere, the performance of eager FPU
 is about 0.4% faster than lazy FPU.
 
 Signed-off-by: Liang Li liang.z...@intel.com
 Signed-off-by: Xudong Hao xudong@intel.com

A patch like this requires much more benchmarking than what you have done.

First, what guest did you use?  A modern Linux guest will hardly ever exit
to userspace: the scheduler uses the TSC deadline timer, which is handled
in the kernel; the clocksource uses the TSC; virtio-blk devices are kicked
via ioeventfd.

What happens if you time a Windows guest (without any Hyper-V enlightenments),
or if you use clocksource=acpi_pm?

Second, 0.4% by itself may not be statistically significant.  How did
you gather the result?  How many times did you run the benchmark?  Did
the guest report any stolen time?
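
As a rough illustration of what I mean by statistically significant: feed
the per-run build times into something like the sketch below and check
whether the 0.4% delta is larger than the run-to-run noise. This is just
an ad-hoc user-space helper sketched here for illustration, not anything
from the kernel tree:

/* Read one build time (seconds) per line from stdin and print the mean
 * and standard deviation, so a small delta can be compared to the noise. */
#include <math.h>
#include <stdio.h>

int main(void)
{
	double x, sum = 0.0, sumsq = 0.0;
	int n = 0;

	while (scanf("%lf", &x) == 1) {
		sum += x;
		sumsq += x * x;
		n++;
	}
	if (n < 2)
		return 1;

	double mean = sum / n;
	double stddev = sqrt((sumsq - n * mean * mean) / (n - 1));

	printf("n=%d mean=%.3f stddev=%.3f (%.2f%% of mean)\n",
	       n, mean, stddev, 100.0 * stddev / mean);
	return 0;
}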


And finally, even if the patch was indeed a performance improvement,
there is much more that you can remove.  fpu_active is always 1, 
vmx_fpu_activate only has one call site that can be simplified just to

vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);

and so on.

Paolo


Re: [PATCH v2 03/10] KVM: arm: guest debug, define API headers

2015-04-23 Thread Alex Bennée

Christoffer Dall christoffer.d...@linaro.org writes:

 On Tue, Mar 31, 2015 at 04:08:01PM +0100, Alex Bennée wrote:
 This commit defines the API headers for guest debugging. There are two
 architecture specific debug structures:
 
   - kvm_guest_debug_arch, allows us to pass in HW debug registers
   - kvm_debug_exit_arch, signals the exact debug exit and pc
 
 The type of debugging being used is controlled by the architecture-specific
 control bits of the kvm_guest_debug->control flags in the ioctl
 structure.
 
 Signed-off-by: Alex Bennée alex.ben...@linaro.org
 
 ---
 v2
- expose hsr and pc directly to user-space
 
 diff --git a/arch/arm64/include/uapi/asm/kvm.h 
 b/arch/arm64/include/uapi/asm/kvm.h
 index 3ef77a4..6ee70a0 100644
 --- a/arch/arm64/include/uapi/asm/kvm.h
 +++ b/arch/arm64/include/uapi/asm/kvm.h
 @@ -100,10 +100,24 @@ struct kvm_sregs {
  struct kvm_fpu {
  };
  
 +/*
 + * See ARM ARM D7.3: Debug Registers

 see the ARM ARM for ??

 + *
 + * The control registers are architecturally defined as 32 bits but are
 + * stored as 64 bit values along side the value registers and aligned

 do you mean alongside?

sure.


 + * with the rest 64 bit registers in the normal CPU context.

 rest of the 64 bit

 + */

 why do we store them as 64 bit values?  There's nothing preventing us
 from defining them as __u32 is there?  Is this to make the ONE_REG
 interface accessers more convenient?

No, but it will involve more fiddling when we copy them into the CPU
context, which keeps everything 64-bit aligned. Of course, if we want
to remove the debug registers from the context and put a pointer in
place then this is fairly moot as we will need to change the hyp.S code
that copies the registers during the world switch. I was trying to
minimise the amount of change to the assembler in this series.


 +#define KVM_ARM_NDBG_REGS 16

 nit: is NDBG short for something I don't know about or is it
 the number of debug registers we are noting here, in which case I think
 KVM_ARM_NUM_DBG_REGS is more clear.

OK.


  struct kvm_guest_debug_arch {
 +__u64 dbg_bcr[KVM_ARM_NDBG_REGS];
 +__u64 dbg_bvr[KVM_ARM_NDBG_REGS];
 +__u64 dbg_wcr[KVM_ARM_NDBG_REGS];
 +__u64 dbg_wvr[KVM_ARM_NDBG_REGS];
  };
  
  struct kvm_debug_exit_arch {
 +__u64 pc;
 +__u32 hsr;
  };
  
  struct kvm_sync_regs {
 @@ -207,4 +221,11 @@ struct kvm_arch_memory_slot {
  
  #endif
  
 +/*
 + * Architecture related debug defines - upper 16 bits of
 + * kvm_guest_debug->control
 + */
 +#define KVM_GUESTDBG_USE_SW_BP  __KVM_GUESTDBG_USE_SW_BP
 +#define KVM_GUESTDBG_USE_HW_BP  __KVM_GUESTDBG_USE_HW_BP
 +
  #endif /* __ARM_KVM_H__ */
 -- 
 2.3.4
 

 Thanks,
 -Christoffer

-- 
Alex Bennée


Re: [PATCH] ARM: KVM: Remove pointless void pointer cast

2015-04-23 Thread Paolo Bonzini


On 23/04/2015 12:07, Firo Yang wrote:
 No need to cast the void pointer returned by kmalloc() in
 arch/arm/kvm/mmu.c::kvm_alloc_stage2_pgd().
 
 Signed-off-by: Firo Yang fir...@gmail.com
 ---
  arch/arm/kvm/mmu.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)
 
 diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
 index 1d5accb..ce0bce4 100644
 --- a/arch/arm/kvm/mmu.c
 +++ b/arch/arm/kvm/mmu.c
 @@ -691,8 +691,8 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
* work.  This is not used by the hardware and we have no
* alignment requirement for this allocation.
*/
 - pgd = (pgd_t *)kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t),
 -GFP_KERNEL | __GFP_ZERO);
 + pgd = kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t),
 + GFP_KERNEL | __GFP_ZERO);
  
   if (!pgd) {
   kvm_free_hwpgd(hwpgd);
 

Acked-by: Paolo Bonzini pbonz...@redhat.com


[PATCH] KVM: s390: remove delayed reallocation of page tables for KVM

2015-04-23 Thread Christian Borntraeger
From: Martin Schwidefsky schwidef...@de.ibm.com

Replacing a 2K page table with a 4K page table while a VMA is active
for the affected memory region is fundamentally broken. Rip out the
page table reallocation code and replace it with a simple system
control 'vm.allocate_pgste'. If the system control is set the page
tables for all processes are allocated as full 4K pages, even for
processes that do not need it.

Signed-off-by: Martin Schwidefsky schwidef...@de.ibm.com
Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com
---
 arch/s390/include/asm/mmu.h |   4 +-
 arch/s390/include/asm/mmu_context.h |   3 +-
 arch/s390/include/asm/pgalloc.h |   1 +
 arch/s390/include/asm/pgtable.h |   9 +++
 arch/s390/kvm/Kconfig   |  16 
 arch/s390/mm/pgtable.c  | 142 +++-
 6 files changed, 74 insertions(+), 101 deletions(-)

diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h
index a5e6562..d29ad95 100644
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -14,7 +14,9 @@ typedef struct {
unsigned long asce_bits;
unsigned long asce_limit;
unsigned long vdso_base;
-   /* The mmu context has extended page tables. */
+   /* The mmu context allocates 4K page tables. */
+   unsigned int alloc_pgste:1;
+   /* The mmu context uses extended page tables. */
unsigned int has_pgste:1;
/* The mmu context uses storage keys. */
unsigned int use_skey:1;
diff --git a/arch/s390/include/asm/mmu_context.h 
b/arch/s390/include/asm/mmu_context.h
index 8fb3802..8b91128 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -21,9 +21,10 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.asce_bits = _ASCE_TABLE_LENGTH | _ASCE_USER_BITS;
 #ifdef CONFIG_64BIT
mm->context.asce_bits |= _ASCE_TYPE_REGION3;
-#endif
+   mm->context.alloc_pgste = page_table_allocate_pgste;
mm->context.has_pgste = 0;
mm->context.use_skey = 0;
+#endif
mm->context.asce_limit = STACK_TOP_MAX;
crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
return 0;
diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index 3009c2b..e0c5834 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -21,6 +21,7 @@ void crst_table_free(struct mm_struct *, unsigned long *);
 unsigned long *page_table_alloc(struct mm_struct *);
 void page_table_free(struct mm_struct *, unsigned long *);
 void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long);
+extern int page_table_allocate_pgste;
 
 int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
  unsigned long key, bool nq);
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index e08ec38..4aaea1d 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -492,6 +492,15 @@ static inline int mm_has_pgste(struct mm_struct *mm)
return 0;
 }
 
+static inline int mm_alloc_pgste(struct mm_struct *mm)
+{
+#ifdef CONFIG_PGSTE
+   if (unlikely(mm->context.alloc_pgste))
+   return 1;
+#endif
+   return 0;
+}
+
 /*
  * In the case that a guest uses storage keys
  * faults should no longer be backed by zero pages
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index 5fce52c..031b5db 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -51,6 +51,22 @@ config KVM_S390_UCONTROL
 
  If unsure, say N.
 
+config KVM_S390_PGSTE_DEFAULT
+   bool "Allocate page status table extension (PGSTE) by default"
+   depends on KVM
+   ---help---
+ To start a KVM guest the page tables for the process need
+ to be allocated with the page status table extension.
+
+ The system control 'vm.allocate_pgste' is queried at process
+ creation if 4K page tables with the PGSTE are required or if
+ 2K page tables are sufficient.
+
+ This option sets the default for 'vm.allocate_pgste'. If
+ you compile a kernel to be used for a KVM host, say Y.
+
+ If unsure, say N.
+
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/vhost/Kconfig
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index b2c1542..d0612d6 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -18,6 +18,7 @@
 #include <linux/rcupdate.h>
 #include <linux/slab.h>
 #include <linux/swapops.h>
+#include <linux/sysctl.h>
 #include <linux/ksm.h>
 #include <linux/mman.h>
 
@@ -928,6 +929,40 @@ unsigned long get_guest_storage_key(struct mm_struct *mm, 
unsigned long addr)
 }
 EXPORT_SYMBOL(get_guest_storage_key);
 
+static int page_table_allocate_pgste_min = 0;
+static int page_table_allocate_pgste_max = 1;
+int page_table_allocate_pgste = 

[PATCH] page table bugfix for s390/kvm

2015-04-23 Thread Christian Borntraeger
Paolo, Alex,

we plan to submit this fixup via Martins s390 tree (as it is all in
common s390 memory management).

It fixes a fundamental design bug in our page table handling. Some
background: Normal page tables are 2kb. For KVM we need a special page
table extension that creates another 2k after the page table (pgste).
As there are some workloads which have a high page table footprint
(e.g. databases with thousands of processes on shared memory), we
want to minimize the impact of the page table extensions to just KVM
processes. Now: our approach of replacing the page table on CREATE_VM
or ENABLE_SIE has a fundamental race to code that gets page table
pointers or ptl locks without holding the pmd lock or page table lock.
So here is another approach: Have a sysctl (with a KCONFIG default)
that decides if we need 4k page tables or 2k page tables.

KVM then needs this sysctl to be set, otherwise CREATE_VM will
return EINVAL.
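
Roughly speaking, that means CREATE_VM on s390 would gain a check along
the lines of the sketch below (illustrative only, not the actual hunk;
mm_alloc_pgste() is the helper added by Martin's patch):

int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
	/* The mm must have been created with 4K page tables + PGSTE,
	 * i.e. vm.allocate_pgste was set before qemu-kvm forked. */
	if (!mm_alloc_pgste(current->mm))
		return -EINVAL;
	/* ... existing setup ... */
	return 0;
}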


Martin Schwidefsky (1):
  KVM: s390: remove delayed reallocation of page tables for KVM

 arch/s390/include/asm/mmu.h |   4 +-
 arch/s390/include/asm/mmu_context.h |   3 +-
 arch/s390/include/asm/pgalloc.h |   1 +
 arch/s390/include/asm/pgtable.h |   9 +++
 arch/s390/kvm/Kconfig   |  16 
 arch/s390/mm/pgtable.c  | 142 +++-
 6 files changed, 74 insertions(+), 101 deletions(-)

-- 
2.3.0



Re: [v6] kvm/fpu: Enable fully eager restore kvm FPU

2015-04-23 Thread Jan Kiszka
On 2015-04-23 12:40, Paolo Bonzini wrote:
 
 
 On 23/04/2015 23:13, Liang Li wrote:
 Remove lazy FPU logic and use eager FPU entirely. Eager FPU does
 not have performance regression, and it can simplify the code.

 When compiling kernel on westmere, the performance of eager FPU
 is about 0.4% faster than lazy FPU.

 Signed-off-by: Liang Li liang.z...@intel.com
 Signed-off-by: Xudong Hao xudong@intel.com
 
 A patch like this requires much more benchmarking than what you have done.
 
 First, what guest did you use?  A modern Linux guest will hardly ever exit
 to userspace: the scheduler uses the TSC deadline timer, which is handled
 in the kernel; the clocksource uses the TSC; virtio-blk devices are kicked
 via ioeventfd.
 
 What happens if you time a Windows guest (without any Hyper-V enlightenments),
 or if you use clocksource=acpi_pm?
 
 Second, 0.4% by itself may not be statistically significant.  How did
 you gather the result?  How many times did you run the benchmark?  Did
 the guest report any stolen time?
 
 
 And finally, even if the patch was indeed a performance improvement,
 there is much more that you can remove.  fpu_active is always 1, 
 vmx_fpu_activate only has one call site that can be simplified just to
 
  vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
  vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
 
 and so on.

And it would be good to know how the benchmarks look like on other CPUs
than the chosen Intel model. Including older ones.

Jan

-- 
Siemens AG, Corporate Technology, CT RTC ITP SES-DE
Corporate Competence Center Embedded Linux


Re: [GIT PULL] First batch of KVM changes for 4.1

2015-04-23 Thread Paolo Bonzini


On 23/04/2015 00:55, Marcelo Tosatti wrote:
 On Wed, Apr 22, 2015 at 11:01:49PM +0200, Paolo Bonzini wrote:


 On 22/04/2015 22:56, Marcelo Tosatti wrote:
 But then why was the task migration notifier even in Jeremy's original
 code for Xen? 
 To cover the vcpu1 -> vcpu2 -> vcpu1 case, I believe.

 Ok, to cover it for non-synchronized TSC.  While KVM requires
 synchronized TSC.

 If that's the case, then it could be reverted indeed; but then why did
 you commit this patch to 4.1? 

 Because it fixes the problem Andy reported (see Subject: KVM: x86: fix
 kvmclock write race (v2) on kvm@). As long as you have Radim's
 fix on top.

 But if it's so rare, and it was known that fixing the host protocol was
 just as good a solution, why was the guest fix committed?
 
 I don't know. Should have fixed the host protocol.

No problem.  Let's do the right thing now.

 I'm just trying to understand.  I am worried that this patch was rushed
 in; so far I had assumed it wasn't (a revert of a revert is rare enough
 that you don't do it lightly...) but maybe I was wrong.
 
 Yes it was rushed in.

Ok, so re-reverted it will be.

Paolo


Re: [PATCH] KVM: s390: remove delayed reallocation of page tables for KVM

2015-04-23 Thread Christian Borntraeger
Am 23.04.2015 um 13:37 schrieb Alexander Graf:
 
 
 Am 23.04.2015 um 13:08 schrieb Christian Borntraeger 
 borntrae...@de.ibm.com:

 From: Martin Schwidefsky schwidef...@de.ibm.com

 Replacing a 2K page table with a 4K page table while a VMA is active
 for the affected memory region is fundamentally broken. Rip out the
 page table reallocation code and replace it with a simple system
 control 'vm.allocate_pgste'. If the system control is set the page
 tables for all processes are allocated as full 4K pages, even for
 processes that do not need it.

 Signed-off-by: Martin Schwidefsky schwidef...@de.ibm.com
 Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com
 
 Couldn't you make this a hidden kconfig option that gets automatically 
 selected when kvm is enabled? Or is there a non-kvm case that needs it too?

For things like RHEV the default could certainly be enabled, but for normal
distros like SLES/RHEL, the idea was to NOT enable that by default, as the 
non-KVM
case is more common and might suffer from the additional memory consumption of
the page tables. (big databases come to mind)

We could think about having rpms like kvm to provide a sysctl file that sets it 
if we
want to minimize the impact. Other ideas?

Christian



Re: [PATCH] KVM: s390: remove delayed reallocation of page tables for KVM

2015-04-23 Thread Alexander Graf


 Am 23.04.2015 um 13:43 schrieb Christian Borntraeger borntrae...@de.ibm.com:
 
 Am 23.04.2015 um 13:37 schrieb Alexander Graf:
 
 
 Am 23.04.2015 um 13:08 schrieb Christian Borntraeger 
 borntrae...@de.ibm.com:
 
 From: Martin Schwidefsky schwidef...@de.ibm.com
 
 Replacing a 2K page table with a 4K page table while a VMA is active
 for the affected memory region is fundamentally broken. Rip out the
 page table reallocation code and replace it with a simple system
 control 'vm.allocate_pgste'. If the system control is set the page
 tables for all processes are allocated as full 4K pages, even for
 processes that do not need it.
 
 Signed-off-by: Martin Schwidefsky schwidef...@de.ibm.com
 Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com
 
 Couldn't you make this a hidden kconfig option that gets automatically 
 selected when kvm is enabled? Or is there a non-kvm case that needs it too?
 
 For things like RHEV the default could certainly be enabled, but for normal
 distros like SLES/RHEL, the idea was to NOT enable that by default, as the 
 non-KVM
 case is more common and might suffer from the additional memory consumption of
 the page tables. (big databases come to mind)
 
 We could think about having rpms like kvm to provide a sysctl file that sets 
 it if we
 want to minimize the impact. Other ideas?

Oh, I'm sorry, I misread the ifdef. I don't think it makes sense to have a 
config option for the default value then, just rely only on sysctl.conf for 
changed defaults.

As far as mechanisms to change it go, every distribution has their own ways of 
dealing with this. RH has a profile thing, we don't really have anything 
central, but individual sysctl.d files for example that a kvm package could 
provide.

Either way, choosing the default shouldn't happen in .config ;). Also, please
add some helpful error message in qemu to guide users to the sysctl.

As far as alternative approaches go, I don't have a great idea otoh. We could 
have an elf flag indicating that this process needs 4k page tables to limit the 
impact to a single process. In fact, could we maybe still limit the scope to 
non-global? A personality may work as well. Or ulimit?


Alex



Re: [PATCH] KVM: s390: remove delayed reallocation of page tables for KVM

2015-04-23 Thread Christian Borntraeger
Am 23.04.2015 um 14:01 schrieb Alexander Graf:
 
 
 Am 23.04.2015 um 13:43 schrieb Christian Borntraeger 
 borntrae...@de.ibm.com:

 Am 23.04.2015 um 13:37 schrieb Alexander Graf:


 Am 23.04.2015 um 13:08 schrieb Christian Borntraeger 
 borntrae...@de.ibm.com:

 From: Martin Schwidefsky schwidef...@de.ibm.com

 Replacing a 2K page table with a 4K page table while a VMA is active
 for the affected memory region is fundamentally broken. Rip out the
 page table reallocation code and replace it with a simple system
 control 'vm.allocate_pgste'. If the system control is set the page
 tables for all processes are allocated as full 4K pages, even for
 processes that do not need it.

 Signed-off-by: Martin Schwidefsky schwidef...@de.ibm.com
 Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com

 Couldn't you make this a hidden kconfig option that gets automatically 
 selected when kvm is enabled? Or is there a non-kvm case that needs it too?

 For things like RHEV the default could certainly be enabled, but for normal
 distros like SLES/RHEL, the idea was to NOT enable that by default, as the 
 non-KVM
 case is more common and might suffer from the additional memory consumption 
 of
 the page tables. (big databases come to mind)

 We could think about having rpms like kvm to provide a sysctl file that sets 
 it if we
 want to minimize the impact. Other ideas?
 
 Oh, I'm sorry, I misread the ifdef. I don't think it makes sense to have a 
 config option for the default value then, just rely only on sysctl.conf for 
 changed defaults.
 
 As far as mechanisms to change it go, every distribution has their own ways 
 of dealing with this. RH has a profile thing, we don't really have anything 
 central, but individual sysctl.d files for example that a kvm package could 
 provide.
 Either way, the default choosing shouldn't happen in .config ;).

So you vote for getting rid of the Kconfig?

Also, please add some helpful error message in qemu to guide users to the 
sysctl.

Yes, we will provide a qemu patch (cc stable) after this hits the kernel.

 As far as alternative approaches go, I don't have a great idea otoh. We could 
 have an elf flag indicating that this process needs 4k page tables to limit 
 the impact to a single process.

This approach was actually Martin's first fix. The problem is that the decision
takes place on execve, but we need an answer at fork time. So we always started
with 4k page tables and freed the second half on execve. Now this did not work
for processes that only fork (without execve).

 In fact, could we maybe still limit the scope to non-global? A personality 
 may work as well. Or ulimit?

I think we will go with the sysctl for now and see if we can come up with some
automatic way in an additional patch later on.

Christian





Re: [PATCH] KVM: s390: remove delayed reallocation of page tables for KVM

2015-04-23 Thread Alexander Graf


 Am 23.04.2015 um 13:08 schrieb Christian Borntraeger borntrae...@de.ibm.com:
 
 From: Martin Schwidefsky schwidef...@de.ibm.com
 
 Replacing a 2K page table with a 4K page table while a VMA is active
 for the affected memory region is fundamentally broken. Rip out the
 page table reallocation code and replace it with a simple system
 control 'vm.allocate_pgste'. If the system control is set the page
 tables for all processes are allocated as full 4K pages, even for
 processes that do not need it.
 
 Signed-off-by: Martin Schwidefsky schwidef...@de.ibm.com
 Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com

Couldn't you make this a hidden kconfig option that gets automatically selected 
when kvm is enabled? Or is there a non-kvm case that needs it too?


Alex



[PATCH] kvm: x86: fix kvmclock update protocol

2015-04-23 Thread Paolo Bonzini
From: Radim Krčmář rkrc...@redhat.com

The kvmclock spec says that the host will increment a version field to
an odd number, then update stuff, then increment it to an even number.
The host is buggy and doesn't do this, and the result is observable
when one vcpu reads another vcpu's kvmclock data.

There's no good way for a guest kernel to keep its vdso from reading
a different vcpu's kvmclock data, but we don't need to care about
changing VCPUs as long as we read consistent data from kvmclock.
(VCPU can change outside of this loop too, so it doesn't matter if we
return a value not fit for this VCPU.)
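
For reference, the guest-side loop referred to above follows the usual
seqcount pattern. The sketch below is only an illustration with a local
stand-in structure, not the real pvclock_vcpu_time_info layout or the
actual vdso code:

#include <stdint.h>

struct pvclock_sketch {
	volatile uint32_t version;	/* odd while the host is updating */
	volatile uint64_t system_time;	/* tsc fields omitted for brevity */
};

static uint64_t read_clock(const struct pvclock_sketch *src)
{
	uint32_t version;
	uint64_t time;

	do {
		version = src->version;
		__atomic_thread_fence(__ATOMIC_ACQUIRE);
		time = src->system_time;  /* plus TSC scaling in real code */
		__atomic_thread_fence(__ATOMIC_ACQUIRE);
	} while ((version & 1) || version != src->version);

	return time;
}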

Based on a patch by Radim Krčmář.

Signed-off-by: Paolo Bonzini pbonz...@redhat.com
---
 arch/x86/kvm/x86.c | 33 -
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ed31c31b2485..c73efcd03e29 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1669,12 +1669,28 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
&guest_hv_clock, sizeof(guest_hv_clock))))
return 0;
 
-   /*
-* The interface expects us to write an even number signaling that the
-* update is finished. Since the guest won't see the intermediate
-* state, we just increase by 2 at the end.
+   /* This VCPU is paused, but it's legal for a guest to read another
+* VCPU's kvmclock, so we really have to follow the specification where
+* it says that version is odd if data is being modified, and even after
+* it is consistent.
+*
+* Version field updates must be kept separate.  This is because
+* kvm_write_guest_cached might use a rep movs instruction, and
+* writes within a string instruction are weakly ordered.  So there
+* are three writes overall.
+*
+* As a small optimization, only write the version field in the first
+* and third write.  The vcpu->pv_time cache is still valid, because the
+* version field is the first in the struct.
 */
-   vcpu->hv_clock.version = guest_hv_clock.version + 2;
+   BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+   vcpu->hv_clock.version = guest_hv_clock.version + 1;
+   kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+   &vcpu->hv_clock,
+   sizeof(vcpu->hv_clock.version));
+
+   smp_wmb();
 
/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
@@ -1695,6 +1711,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
&vcpu->hv_clock,
sizeof(vcpu->hv_clock));
+
+   smp_wmb();
+
+   vcpu->hv_clock.version++;
+   kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+   &vcpu->hv_clock,
+   sizeof(vcpu->hv_clock.version));
return 0;
 }
 
-- 
1.8.3.1



[PATCH] x86: pvclock: Really remove the sched notifier for cross-cpu migrations

2015-04-23 Thread Paolo Bonzini
This reverts commits 0a4e6be9ca17c54817cf814b4b5aa60478c6df27
and 80f7fdb1c7f0f9266421f823964fd1962681f6ce.

The task migration notifier was originally introduced in order to support
the pvclock vsyscall with non-synchronized TSC, but KVM only supports it
with synchronized TSC.  Hence, on KVM the race condition is only needed
due to a bad implementation on the host side, and even then it's so rare
that it's mostly theoretical.

As far as KVM is concerned it's possible to fix the host, avoiding the
additional complexity in the vDSO and the (re)introduction of the task
migration notifier.

Xen, on the other hand, hasn't yet implemented vsyscall support at
all, so we do not care about its plans for non-synchronized TSC.

Reported-by: Peter Zijlstra pet...@infradead.org
Suggested-by: Marcelo Tosatti mtosa...@redhat.com
Signed-off-by: Paolo Bonzini pbonz...@redhat.com
---
I can put this in my next KVM pull request.

 arch/x86/include/asm/pvclock.h |  1 -
 arch/x86/kernel/pvclock.c  | 44 --
 arch/x86/vdso/vclock_gettime.c | 34 ++--
 include/linux/sched.h  |  8 
 kernel/sched/core.c| 15 --
 5 files changed, 15 insertions(+), 87 deletions(-)

diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 25b1cc07d496..d6b078e9fa28 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -95,7 +95,6 @@ unsigned __pvclock_read_cycles(const struct 
pvclock_vcpu_time_info *src,
 
 struct pvclock_vsyscall_time_info {
struct pvclock_vcpu_time_info pvti;
-   u32 migrate_count;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index e5ecd20e72dd..2f355d229a58 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -141,46 +141,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock 
*wall_clock,
set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
 }
 
-static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
-
-static struct pvclock_vsyscall_time_info *
-pvclock_get_vsyscall_user_time_info(int cpu)
-{
-   if (!pvclock_vdso_info) {
-   BUG();
-   return NULL;
-   }
-
-   return &pvclock_vdso_info[cpu];
-}
-
-struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
-{
-   return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
-}
-
 #ifdef CONFIG_X86_64
-static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
-   void *v)
-{
-   struct task_migration_notifier *mn = v;
-   struct pvclock_vsyscall_time_info *pvti;
-
-   pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
-
-   /* this is NULL when pvclock vsyscall is not initialized */
-   if (unlikely(pvti == NULL))
-   return NOTIFY_DONE;
-
-   pvti->migrate_count++;
-
-   return NOTIFY_DONE;
-}
-
-static struct notifier_block pvclock_migrate = {
-   .notifier_call = pvclock_task_migrate,
-};
-
 /*
  * Initialize the generic pvclock vsyscall state.  This will allocate
  * a/some page(s) for the per-vcpu pvclock information, set up a
@@ -194,17 +155,12 @@ int __init pvclock_init_vsyscall(struct 
pvclock_vsyscall_time_info *i,
 
WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
 
-   pvclock_vdso_info = i;
-
for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
__set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
 __pa(i) + (idx*PAGE_SIZE),
 PAGE_KERNEL_VVAR);
}
 
-
-   register_task_migration_notifier(pvclock_migrate);
-
return 0;
 }
 #endif
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 40d2473836c9..9793322751e0 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -82,15 +82,18 @@ static notrace cycle_t vread_pvclock(int *mode)
cycle_t ret;
u64 last;
u32 version;
-   u32 migrate_count;
u8 flags;
unsigned cpu, cpu1;
 
 
/*
-* When looping to get a consistent (time-info, tsc) pair, we
-* also need to deal with the possibility we can switch vcpus,
-* so make sure we always re-fetch time-info for the current vcpu.
+* Note: hypervisor must guarantee that:
+* 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
+* 2. that per-CPU pvclock time info is updated if the
+*underlying CPU changes.
+* 3. that version is increased whenever underlying CPU
+*changes.
+*
 */
do {
cpu = __getcpu() & VGETCPU_CPU_MASK;
@@ -99,27 +102,20 @@ static notrace cycle_t vread_pvclock(int *mode)
 * __getcpu() calls (Gleb).
 */
 
-  

Re: [GIT PULL] First batch of KVM changes for 4.1

2015-04-23 Thread Marcelo Tosatti
On Thu, Apr 23, 2015 at 11:13:23AM +0200, Paolo Bonzini wrote:
 
 
 On 22/04/2015 23:21, Marcelo Tosatti wrote:
  On Mon, Apr 20, 2015 at 01:27:58PM -0700, Andy Lutomirski wrote:
  On Mon, Apr 20, 2015 at 9:59 AM, Paolo Bonzini pbonz...@redhat.com wrote:
 
 
  On 17/04/2015 22:18, Marcelo Tosatti wrote:
  The bug which this is fixing is very rare; I have no memory of a report.
 
  In fact, it's even difficult to create a synthetic reproducer.
 
  But then why was the task migration notifier even in Jeremy's original
  code for Xen?  Was it supposed to work even on non-synchronized TSC?
 
  If that's the case, then it could be reverted indeed; but then why did
  you commit this patch to 4.1?  Did you think of something that would
  cause the seqcount-like protocol to fail, and that turned out not to be
  the case later?  I was only following the mailing list sparsely in March.
 
  I don't think anyone ever tried that hard to test this stuff.  There
  was an infinite loop that Firefox was triggering as a KVM guest
  somewhat reliably until a couple months ago in the same vdso code.  :(
  
  https://bugzilla.redhat.com/show_bug.cgi?id=1174664
 
 That was the missing volatile in an asm.  Older compilers didn't catch
 it. :(

How do you know that? It looks like memory corruption (look at the
pattern at the end).




Re: [GIT PULL] First batch of KVM changes for 4.1

2015-04-23 Thread Paolo Bonzini


On 23/04/2015 13:51, Marcelo Tosatti wrote:
   https://bugzilla.redhat.com/show_bug.cgi?id=1174664
  
  That was the missing volatile in an asm.  Older compilers didn't catch
  it. :(
 How do you know that? It looks like memory corruption (look at the
 pattern at the end).

I suspect some kind of operator error there, it makes no sense.

On the other hand, bug 1178975 is much clearer and the symptoms are the
same.  In that bug, you can see that the same kernel source works on f20
(package version 3.17.7-200.fc20.x86_64) and fails on f21 (package
version 3.17.7-300.fc21.x86_64).  Of course the compiler is different.
The newer one hoists the lsl out of the loop; if you get a CPU migration
at the wrong time, the cpu != cpu1 condition will always be true and the
loop will never exit.
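
The effect can be reproduced outside the vdso. In the sketch below the
asm statement is deliberately not marked volatile, so the compiler may
treat the CPU-number read as loop invariant and merge or hoist the two
reads, which is the same class of bug. The rdtscp-based helper is only an
illustration; the real vdso uses an lsl on the per-cpu segment:

#include <stdio.h>

/* Without "volatile", gcc may assume the asm has no side effects and
 * depends only on its (empty) inputs, so the two reads in the loop can
 * be merged and hoisted out of it -- a migration between them is then
 * either never observed or observed forever. */
static inline unsigned current_cpu(void)
{
	unsigned cpu;

	asm("rdtscp" : "=c" (cpu) : : "eax", "edx");
	return cpu;
}

int main(void)
{
	unsigned cpu, cpu1;

	do {
		cpu = current_cpu();
		/* ... read per-cpu clock data here ... */
		cpu1 = current_cpu();
	} while (cpu != cpu1);

	printf("read consistently on cpu %u\n", cpu);
	return 0;
}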

Paolo


Re: [PATCH] KVM: s390: remove delayed reallocation of page tables for KVM

2015-04-23 Thread Paolo Bonzini


On 23/04/2015 13:43, Christian Borntraeger wrote:
  
  Couldn't you make this a hidden kconfig option that gets automatically 
  selected when kvm is enabled? Or is there a non-kvm case that needs it too?
 For things like RHEV the default could certainly be enabled, but for normal
 distros like SLES/RHEL, the idea was to NOT enable that by default, as the 
 non-KVM
 case is more common and might suffer from the additional memory consumption of
 the page tables. (big databases come to mind)
 
 We could think about having rpms like kvm to provide a sysctl file that sets 
 it if we
 want to minimize the impact. Other ideas?

I can say what _won't_ work which is tying it to the KVM module.
Nowadays it is loaded automatically via udev on the first /dev/kvm
access, and that's already too late because qemu-kvm's page tables have
been created already.  Right?

With my Fedora hat on, adding a sysctl file to the userspace RPMs (e.g.
qemu) would work.  CCing Cole Robinson who is the main maintainer of the
Fedora virt packages.

Paolo


Re: [PATCH] KVM: s390: remove delayed reallocation of page tables for KVM

2015-04-23 Thread Martin Schwidefsky
On Thu, 23 Apr 2015 14:01:23 +0200
Alexander Graf ag...@suse.de wrote:

 As far as alternative approaches go, I don't have a great idea otoh.
 We could have an elf flag indicating that this process needs 4k page
 tables to limit the impact to a single process. In fact, could we
 maybe still limit the scope to non-global? A personality may work
 as well. Or ulimit?

I tried the ELF flag approach; it does not work. The trouble is that
allocate_mm() has to create the page tables with 4K tables if you
want to change the page table layout later on. We have learned the
hard way that the direction 2K to 4K does not work due to races
in the mm.

Now there are two major cases: 1) fork + execve and 2) fork only.
The ELF flag can be used to reduce from 4K to 2K for 1) but not 2).
2) is required for apps that use lots of forking, e.g. database or
web servers. Same goes for the approach with a personality flag or
ulimit.

We would have to distinguish the two cases for allocate_mm(),
if the new mm is allocated for a fork the current mm decides
2K vs. 4K. If the new mm is allocated by binfmt_elf, then start
with 4K and do the downgrade after the ELF flag has been evaluated.

-- 
blue skies,
   Martin.

Reality continues to ruin my life. - Calvin.



Re: [v6] kvm/fpu: Enable fully eager restore kvm FPU

2015-04-23 Thread Wanpeng Li
Cc Rik, who is doing the similar work. :)
On Fri, Apr 24, 2015 at 05:13:03AM +0800, Liang Li wrote:
Remove lazy FPU logic and use eager FPU entirely. Eager FPU does
not have performance regression, and it can simplify the code.

When compiling kernel on westmere, the performance of eager FPU
is about 0.4% faster than lazy FPU.

Signed-off-by: Liang Li liang.z...@intel.com
Signed-off-by: Xudong Hao xudong@intel.com
---
 arch/x86/include/asm/kvm_host.h |  1 -
 arch/x86/kvm/svm.c  | 22 ++--
 arch/x86/kvm/vmx.c  | 74 +++--
 arch/x86/kvm/x86.c  |  8 +
 include/linux/kvm_host.h|  2 --
 5 files changed, 9 insertions(+), 98 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index dea2e7e..5d84cc9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -743,7 +743,6 @@ struct kvm_x86_ops {
   void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
   unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
   void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
-  void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
 
   void (*tlb_flush)(struct kvm_vcpu *vcpu);
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ce741b8..1b3b29b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1087,7 +1087,6 @@ static void init_vmcb(struct vcpu_svm *svm)
   struct vmcb_control_area *control = svm-vmcb-control;
   struct vmcb_save_area *save = svm-vmcb-save;
 
-  svm-vcpu.fpu_active = 1;
   svm-vcpu.arch.hflags = 0;
 
   set_cr_intercept(svm, INTERCEPT_CR0_READ);
@@ -1529,15 +1528,12 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
   ulong gcr0 = svm-vcpu.arch.cr0;
   u64 *hcr0 = svm-vmcb-save.cr0;
 
-  if (!svm-vcpu.fpu_active)
-  *hcr0 |= SVM_CR0_SELECTIVE_MASK;
-  else
-  *hcr0 = (*hcr0  ~SVM_CR0_SELECTIVE_MASK)
-  | (gcr0  SVM_CR0_SELECTIVE_MASK);
+  *hcr0 = (*hcr0  ~SVM_CR0_SELECTIVE_MASK)
+  | (gcr0  SVM_CR0_SELECTIVE_MASK);
 
   mark_dirty(svm-vmcb, VMCB_CR);
 
-  if (gcr0 == *hcr0  svm-vcpu.fpu_active) {
+  if (gcr0 == *hcr0) {
   clr_cr_intercept(svm, INTERCEPT_CR0_READ);
   clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
   } else {
@@ -1568,8 +1564,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned 
long cr0)
   if (!npt_enabled)
   cr0 |= X86_CR0_PG | X86_CR0_WP;
 
-  if (!vcpu-fpu_active)
-  cr0 |= X86_CR0_TS;
   /*
* re-enable caching here because the QEMU bios
* does not do it - this results in some delay at
@@ -1795,7 +1789,6 @@ static void svm_fpu_activate(struct kvm_vcpu *vcpu)
 
   clr_exception_intercept(svm, NM_VECTOR);
 
-  svm-vcpu.fpu_active = 1;
   update_cr0_intercept(svm);
 }
 
@@ -4139,14 +4132,6 @@ static bool svm_has_wbinvd_exit(void)
   return true;
 }
 
-static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
-{
-  struct vcpu_svm *svm = to_svm(vcpu);
-
-  set_exception_intercept(svm, NM_VECTOR);
-  update_cr0_intercept(svm);
-}
-
 #define PRE_EX(exit)  { .exit_code = (exit), \
   .stage = X86_ICPT_PRE_EXCEPT, }
 #define POST_EX(exit) { .exit_code = (exit), \
@@ -4381,7 +4366,6 @@ static struct kvm_x86_ops svm_x86_ops = {
   .cache_reg = svm_cache_reg,
   .get_rflags = svm_get_rflags,
   .set_rflags = svm_set_rflags,
-  .fpu_deactivate = svm_fpu_deactivate,
 
   .tlb_flush = svm_flush_tlb,
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f5e8dce..811a666 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1567,7 +1567,7 @@ static void update_exception_bitmap(struct kvm_vcpu 
*vcpu)
   u32 eb;
 
   eb = (1u  PF_VECTOR) | (1u  UD_VECTOR) | (1u  MC_VECTOR) |
-   (1u  NM_VECTOR) | (1u  DB_VECTOR);
+   (1u  DB_VECTOR);
   if ((vcpu-guest_debug 
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
   (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
@@ -1576,8 +1576,6 @@ static void update_exception_bitmap(struct kvm_vcpu 
*vcpu)
   eb = ~0;
   if (enable_ept)
   eb = ~(1u  PF_VECTOR); /* bypass_guest_pf = 0 */
-  if (vcpu-fpu_active)
-  eb = ~(1u  NM_VECTOR);
 
   /* When we are running a nested L2 guest and L1 specified for it a
* certain exception bitmap, we must trap the same exceptions and pass
@@ -1961,9 +1959,6 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
 {
   ulong cr0;
 
-  if (vcpu-fpu_active)
-  return;
-  vcpu-fpu_active = 1;
   cr0 = vmcs_readl(GUEST_CR0);
   cr0 = ~(X86_CR0_TS | X86_CR0_MP);
   cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
@@ -1994,33 +1989,6 @@ static inline unsigned long nested_read_cr4(struct 
vmcs12 *fields)
 

Re: [v6] kvm/fpu: Enable fully eager restore kvm FPU

2015-04-23 Thread Wanpeng Li
On Fri, Apr 24, 2015 at 05:13:03AM +0800, Liang Li wrote:
Remove lazy FPU logic and use eager FPU entirely. Eager FPU does
not have performance regression, and it can simplify the code.

When compiling kernel on westmere, the performance of eager FPU
is about 0.4% faster than lazy FPU.

Signed-off-by: Liang Li liang.z...@intel.com
Signed-off-by: Xudong Hao xudong@intel.com
---
 arch/x86/include/asm/kvm_host.h |  1 -
 arch/x86/kvm/svm.c  | 22 ++--
 arch/x86/kvm/vmx.c  | 74 +++--
 arch/x86/kvm/x86.c  |  8 +
 include/linux/kvm_host.h|  2 --
 5 files changed, 9 insertions(+), 98 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index dea2e7e..5d84cc9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -743,7 +743,6 @@ struct kvm_x86_ops {
   void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
   unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
   void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
-  void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
 
   void (*tlb_flush)(struct kvm_vcpu *vcpu);
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ce741b8..1b3b29b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1087,7 +1087,6 @@ static void init_vmcb(struct vcpu_svm *svm)
   struct vmcb_control_area *control = svm-vmcb-control;
   struct vmcb_save_area *save = svm-vmcb-save;
 
-  svm-vcpu.fpu_active = 1;
   svm-vcpu.arch.hflags = 0;
 
   set_cr_intercept(svm, INTERCEPT_CR0_READ);
@@ -1529,15 +1528,12 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
  ulong gcr0 = svm->vcpu.arch.cr0;
  u64 *hcr0 = &svm->vmcb->save.cr0;

-  if (!svm->vcpu.fpu_active)
-  *hcr0 |= SVM_CR0_SELECTIVE_MASK;
-  else
-  *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
-  | (gcr0 & SVM_CR0_SELECTIVE_MASK);
+  *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
+  | (gcr0 & SVM_CR0_SELECTIVE_MASK);

  mark_dirty(svm->vmcb, VMCB_CR);

-  if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
+  if (gcr0 == *hcr0) {
   clr_cr_intercept(svm, INTERCEPT_CR0_READ);
   clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
   } else {
@@ -1568,8 +1564,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned 
long cr0)
   if (!npt_enabled)
   cr0 |= X86_CR0_PG | X86_CR0_WP;
 
-  if (!vcpu->fpu_active)
-  cr0 |= X86_CR0_TS;
   /*
* re-enable caching here because the QEMU bios
* does not do it - this results in some delay at
@@ -1795,7 +1789,6 @@ static void svm_fpu_activate(struct kvm_vcpu *vcpu)
 
   clr_exception_intercept(svm, NM_VECTOR);
 
-  svm->vcpu.fpu_active = 1;
   update_cr0_intercept(svm);
 }
 
@@ -4139,14 +4132,6 @@ static bool svm_has_wbinvd_exit(void)
   return true;
 }
 
-static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
-{
-  struct vcpu_svm *svm = to_svm(vcpu);
-
-  set_exception_intercept(svm, NM_VECTOR);
-  update_cr0_intercept(svm);
-}

Do you test it on AMD cpu? What's the performance you get?

Regards,
Wanpeng Li 

-
 #define PRE_EX(exit)  { .exit_code = (exit), \
   .stage = X86_ICPT_PRE_EXCEPT, }
 #define POST_EX(exit) { .exit_code = (exit), \
@@ -4381,7 +4366,6 @@ static struct kvm_x86_ops svm_x86_ops = {
   .cache_reg = svm_cache_reg,
   .get_rflags = svm_get_rflags,
   .set_rflags = svm_set_rflags,
-  .fpu_deactivate = svm_fpu_deactivate,
 
   .tlb_flush = svm_flush_tlb,
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f5e8dce..811a666 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1567,7 +1567,7 @@ static void update_exception_bitmap(struct kvm_vcpu 
*vcpu)
   u32 eb;
 
  eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
-   (1u << NM_VECTOR) | (1u << DB_VECTOR);
+   (1u << DB_VECTOR);
  if ((vcpu->guest_debug &
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
  (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
@@ -1576,8 +1576,6 @@ static void update_exception_bitmap(struct kvm_vcpu 
*vcpu)
  eb = ~0;
  if (enable_ept)
  eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
-  if (vcpu->fpu_active)
-  eb &= ~(1u << NM_VECTOR);
 
   /* When we are running a nested L2 guest and L1 specified for it a
* certain exception bitmap, we must trap the same exceptions and pass
@@ -1961,9 +1959,6 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
 {
   ulong cr0;
 
-  if (vcpu->fpu_active)
-  return;
-  vcpu->fpu_active = 1;
  cr0 = vmcs_readl(GUEST_CR0);
  cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
   cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
@@ -1994,33 +1989,6 @@ static inline unsigned long 

[PATCH 2/3] KVM: PPC: Book3S HV: Fix bug in dirty page tracking

2015-04-23 Thread Paul Mackerras
This fixes a bug in the tracking of pages that get modified by the
guest.  If the guest creates a large-page HPTE, writes to memory
somewhere within the large page, and then removes the HPTE, we only
record the modified state for the first normal page within the large
page, when in fact the guest might have modified some other normal
page within the large page.

To fix this we use some unused bits in the rmap entry to record the
order (log base 2) of the size of the page that was modified, when
removing an HPTE.  Then in kvm_test_clear_dirty_npages() we use that
order to return the correct number of modified pages.
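
To make the encoding concrete: the order field occupies bits 48-53 of the
rmap word (mask 0x3f shifted by 48, see KVMPPC_RMAP_CHG_ORDER below), and is
turned back into a base-page count when the dirty log is harvested.  A small
self-contained sketch of the idea, with illustrative helper names rather than
the ones added by the patch:

    #define RMAP_CHG_SHIFT  48
    #define RMAP_CHG_ORDER  (0x3ful << RMAP_CHG_SHIFT)

    /* Remember the order (log2 of the page size) of a removed mapping,
     * keeping the largest order seen so far for this rmap entry. */
    static void record_change_order(unsigned long *rmap, unsigned long psize)
    {
            unsigned long order = 63 - __builtin_clzl(psize);   /* ilog2 */

            if (order > ((*rmap & RMAP_CHG_ORDER) >> RMAP_CHG_SHIFT))
                    *rmap = (*rmap & ~RMAP_CHG_ORDER) |
                            (order << RMAP_CHG_SHIFT);
    }

    /* Convert the recorded order back into a count of dirty base pages. */
    static unsigned long npages_from_order(unsigned long rmap,
                                           unsigned long page_shift)
    {
            unsigned long order = (rmap & RMAP_CHG_ORDER) >> RMAP_CHG_SHIFT;

            return order > page_shift ? 1ul << (order - page_shift) : 1;
    }

For example, a modified 16MB page (order 24) on a host with 64kB base pages
(page shift 16) now reports 1ul << (24 - 16) = 256 dirty pages rather than 1.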

The same thing could in principle happen when removing an HPTE at the
host's request, i.e. when paging out a page, except that we never
page out large pages, and the guest can only create large-page HPTEs
if the guest RAM is backed by large pages.  However, we also fix
this case for the sake of future-proofing.

The reference bit is also subject to the same loss of information.  We
don't make the same fix here for the reference bit because there isn't
an interface for userspace to find out which pages the guest has
referenced, whereas there is one for userspace to find out which pages
the guest has modified.  Because of this loss of information, the
kvm_age_hva_hv() and kvm_test_age_hva_hv() functions might incorrectly
say that a page has not been referenced when it has, but that doesn't
matter greatly because we never page or swap out large pages.

Signed-off-by: Paul Mackerras pau...@samba.org
---
 arch/powerpc/include/asm/kvm_book3s.h |  1 +
 arch/powerpc/include/asm/kvm_host.h   |  2 ++
 arch/powerpc/kvm/book3s_64_mmu_hv.c   |  8 +++-
 arch/powerpc/kvm/book3s_hv_rm_mmu.c   | 17 +
 4 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index 578e550..9b072a5 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -162,6 +162,7 @@ extern pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t 
gpa, bool writing,
bool *writable);
 extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
unsigned long *rmap, long pte_index, int realmode);
+extern void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long 
psize);
 extern void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep,
unsigned long pte_index);
 void kvmppc_clear_ref_hpte(struct kvm *kvm, __be64 *hptep,
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index d67a838..9c2617e 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -205,8 +205,10 @@ struct revmap_entry {
  */
 #define KVMPPC_RMAP_LOCK_BIT   63
 #define KVMPPC_RMAP_RC_SHIFT   32
+#define KVMPPC_RMAP_CHG_SHIFT  48
#define KVMPPC_RMAP_REFERENCED (HPTE_R_R << KVMPPC_RMAP_RC_SHIFT)
#define KVMPPC_RMAP_CHANGED    (HPTE_R_C << KVMPPC_RMAP_RC_SHIFT)
+#define KVMPPC_RMAP_CHG_ORDER  (0x3ful << KVMPPC_RMAP_CHG_SHIFT)
#define KVMPPC_RMAP_PRESENT    0x100000000ul
#define KVMPPC_RMAP_INDEX      0xfffffffful
 
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c 
b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index d6fe308..c9c25af 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -763,6 +763,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long 
*rmapp,
/* Harvest R and C */
rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
+   if (rcbits & HPTE_R_C)
+   kvmppc_update_rmap_change(rmapp, psize);
if (rcbits & ~rev[i].guest_rpte) {
rev[i].guest_rpte = ptel | rcbits;
note_hpte_modification(kvm, rev[i]);
@@ -929,8 +931,12 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, 
unsigned long *rmapp)
  retry:
lock_rmap(rmapp);
if (*rmapp & KVMPPC_RMAP_CHANGED) {
-   *rmapp &= ~KVMPPC_RMAP_CHANGED;
+   long change_order = (*rmapp & KVMPPC_RMAP_CHG_ORDER)
+   >> KVMPPC_RMAP_CHG_SHIFT;
+   *rmapp &= ~(KVMPPC_RMAP_CHANGED | KVMPPC_RMAP_CHG_ORDER);
npages_dirty = 1;
+   if (change_order > PAGE_SHIFT)
+   npages_dirty = 1ul << (change_order - PAGE_SHIFT);
}
if (!(*rmapp  KVMPPC_RMAP_PRESENT)) {
unlock_rmap(rmapp);
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 5c1737f..24ccc79 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -12,6 +12,7 @@
 #include <linux/kvm_host.h>
 #include <linux/hugetlb.h>
 #include <linux/module.h>
+#include <linux/log2.h>
 
 #include 

[PATCH 3/3] KVM: PPC: Book3S HV: Implement H_CLEAR_REF and H_CLEAR_MOD

2015-04-23 Thread Paul Mackerras
This adds implementations for the H_CLEAR_REF (test and clear reference
bit) and H_CLEAR_MOD (test and clear changed bit) hypercalls.

When clearing the reference or change bit in the guest view of the HPTE,
we also have to clear it in the real HPTE so that we can detect future
references or changes.  When we do so, we transfer the R or C bit value
to the rmap entry for the underlying host page so that kvm_age_hva_hv(),
kvm_test_age_hva_hv() and kvmppc_hv_get_dirty_log() know that the page
has been referenced and/or changed.
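
A guest that does implement them goes through the normal hcall interface:
H_CLEAR_MOD takes a flags word and an HPTE index, and the guest view of the
second PTE doubleword (including its R and C bits) comes back in R4, which is
vcpu->arch.gpr[4] in the code below.  Purely as an illustration, written with
the Linux-side plpar_hcall() notation even though Linux never issues this
call, and with a hypothetical wrapper name:

    #include <asm/hvcall.h>        /* H_CLEAR_MOD, H_SUCCESS, plpar_hcall() */

    /* HPTE_R_C is the changed bit from the powerpc hash-MMU headers.
     * Returns 1 if the mapped page had been modified, 0 if clean,
     * -1 on error; the change bit is cleared as a side effect. */
    static int hpte_test_and_clear_mod(unsigned long pte_index)
    {
            unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
            long rc;

            rc = plpar_hcall(H_CLEAR_MOD, retbuf, 0 /* flags */, pte_index);
            if (rc != H_SUCCESS)
                    return -1;

            /* retbuf[0] is R4 on return: the old guest-view PTE word. */
            return (retbuf[0] & HPTE_R_C) != 0;
    }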

These hypercalls are not used by Linux guests and these implementations
are only compile tested.

Signed-off-by: Paul Mackerras pau...@samba.org
---
 arch/powerpc/kvm/book3s_hv_rm_mmu.c | 126 ++--
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |   4 +-
 2 files changed, 121 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 24ccc79..479ff7e 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -109,25 +109,38 @@ void kvmppc_update_rmap_change(unsigned long *rmap, 
unsigned long psize)
 }
 EXPORT_SYMBOL_GPL(kvmppc_update_rmap_change);
 
+/* Returns a pointer to the revmap entry for the page mapped by a HPTE */
+static unsigned long *revmap_for_hpte(struct kvm *kvm, unsigned long hpte_v,
+ unsigned long hpte_gr)
+{
+   struct kvm_memory_slot *memslot;
+   unsigned long *rmap;
+   unsigned long gfn;
+
+   gfn = hpte_rpn(hpte_gr, hpte_page_size(hpte_v, hpte_gr));
+   memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
+   if (!memslot)
+   return NULL;
+
+   rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
+   return rmap;
+}
+
 /* Remove this HPTE from the chain for a real page */
 static void remove_revmap_chain(struct kvm *kvm, long pte_index,
struct revmap_entry *rev,
unsigned long hpte_v, unsigned long hpte_r)
 {
struct revmap_entry *next, *prev;
-   unsigned long gfn, ptel, head;
-   struct kvm_memory_slot *memslot;
+   unsigned long ptel, head;
unsigned long *rmap;
unsigned long rcbits;
 
rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
ptel = rev->guest_rpte |= rcbits;
-   gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
-   memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
-   if (!memslot)
+   rmap = revmap_for_hpte(kvm, hpte_v, ptel);
+   if (!rmap)
return;
-
-   rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
lock_rmap(rmap);
 
head = *rmap & KVMPPC_RMAP_INDEX;
@@ -662,6 +675,105 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long 
flags,
return H_SUCCESS;
 }
 
+long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
+   unsigned long pte_index)
+{
+   struct kvm *kvm = vcpu->kvm;
+   __be64 *hpte;
+   unsigned long v, r, gr;
+   struct revmap_entry *rev;
+   unsigned long *rmap;
+   long ret = H_NOT_FOUND;
+
+   if (pte_index >= kvm->arch.hpt_npte)
+   return H_PARAMETER;
+
+   rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+   hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+   while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
+   cpu_relax();
+   v = be64_to_cpu(hpte[0]);
+   r = be64_to_cpu(hpte[1]);
+   if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
+   goto out;
+
+   gr = rev->guest_rpte;
+   if (rev->guest_rpte & HPTE_R_R) {
+   rev->guest_rpte &= ~HPTE_R_R;
+   note_hpte_modification(kvm, rev);
+   }
+   if (v & HPTE_V_VALID) {
+   gr |= r & (HPTE_R_R | HPTE_R_C);
+   if (r & HPTE_R_R) {
+   kvmppc_clear_ref_hpte(kvm, hpte, pte_index);
+   rmap = revmap_for_hpte(kvm, v, gr);
+   if (rmap) {
+   lock_rmap(rmap);
+   *rmap |= KVMPPC_RMAP_REFERENCED;
+   unlock_rmap(rmap);
+   }
+   }
+   }
+   vcpu->arch.gpr[4] = gr;
+   ret = H_SUCCESS;
+ out:
+   unlock_hpte(hpte, v & ~HPTE_V_HVLOCK);
+   return ret;
+}
+
+long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
+   unsigned long pte_index)
+{
+   struct kvm *kvm = vcpu->kvm;
+   __be64 *hpte;
+   unsigned long v, r, gr;
+   struct revmap_entry *rev;
+   unsigned long *rmap;
+   long ret = H_NOT_FOUND;
+
+   if (pte_index >= kvm->arch.hpt_npte)
+   return H_PARAMETER;
+
+   rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+   hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+   while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
+

[PATCH 1/3] KVM: PPC: Book3S HV: Fix race in reading change bit when removing HPTE

2015-04-23 Thread Paul Mackerras
The reference (R) and change (C) bits in a HPT entry can be set by
hardware at any time up until the HPTE is invalidated and the TLB
invalidation sequence has completed.  This means that when removing
a HPTE, we need to read the HPTE after the invalidation sequence has
completed in order to obtain reliable values of R and C.  The code
in kvmppc_do_h_remove() used to do this.  However, commit 6f22bd3265fb
(KVM: PPC: Book3S HV: Make HTAB code LE host aware) removed the
read after invalidation as a side effect of other changes.  This
restores the read of the HPTE after invalidation.
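
Put another way, the ordering that matters is: clear the valid bit, complete
the tlbie/ptesync sequence, and only then sample the second doubleword for
its final R and C values.  A condensed paraphrase of the corrected flow in
kvmppc_do_h_remove() (this restates the hunk below, it is not new code):

    hpte[0] &= ~cpu_to_be64(HPTE_V_VALID);           /* 1. invalidate HPTE */
    rb = compute_tlbie_rb(v, be64_to_cpu(hpte[1]), pte_index);
    do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true); /* 2. tlbie */
    /* 3. hardware can no longer update R/C, so this read sees final values */
    remove_revmap_chain(kvm, pte_index, rev, v, be64_to_cpu(hpte[1]));

The old code read hpte[1] once, before the tlbie, so an R or C update made by
the hardware between that read and completion of the invalidation was lost.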

The user-visible effect of this bug would be that when migrating a
guest, there is a small probability that a page modified by the guest
and then unmapped by the guest might not get re-transmitted and thus
the destination might end up with a stale copy of the page.

Fixes: 6f22bd3265fb
Cc: sta...@vger.kernel.org # v3.17+
Signed-off-by: Paul Mackerras pau...@samba.org
---
 arch/powerpc/kvm/book3s_hv_rm_mmu.c | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index f6bf0b1..5c1737f 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -413,14 +413,12 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long 
flags,
rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
v = pte & ~HPTE_V_HVLOCK;
if (v & HPTE_V_VALID) {
-   u64 pte1;
-
-   pte1 = be64_to_cpu(hpte[1]);
hpte[0] &= ~cpu_to_be64(HPTE_V_VALID);
-   rb = compute_tlbie_rb(v, pte1, pte_index);
+   rb = compute_tlbie_rb(v, be64_to_cpu(hpte[1]), pte_index);
do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true);
/* Read PTE low word after tlbie to get final R/C values */
-   remove_revmap_chain(kvm, pte_index, rev, v, pte1);
+   remove_revmap_chain(kvm, pte_index, rev, v,
+   be64_to_cpu(hpte[1]));
}
r = rev->guest_rpte & ~HPTE_GR_RESERVED;
note_hpte_modification(kvm, rev);
-- 
2.1.4



[PATCH 0/3] PPC HV bug fixes + hcalls for FreeBSD

2015-04-23 Thread Paul Mackerras
The main purpose of this series is to implement the H_CLEAR_REF and
H_CLEAR_MOD hypercalls defined by PAPR.  We are doing this for the
sake of FreeBSD guests as Linux guests don't use them.  Along the way
I found a couple of bugs, so the fixes for those are split out as the
first two patches.

The first two patches could go in immediately.  I'd like to get
feedback from actual users of H_CLEAR_REF/MOD before the third patch
goes in.

These patches are against Alex Graf's kvm-ppc-queue branch.

Paul.

 arch/powerpc/include/asm/kvm_host.h |  2 ++
 arch/powerpc/kernel/asm-offsets.c   |  3 ++
 arch/powerpc/kvm/book3s_64_mmu_hv.c |  6 +++-
 arch/powerpc/kvm/book3s_hv.c| 51 ++---
 arch/powerpc/kvm/book3s_hv_builtin.c| 16 +--
 arch/powerpc/kvm/book3s_hv_rm_mmu.c | 26 +
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 22 --
 7 files changed, 98 insertions(+), 28 deletions(-)


Re: [v6] kvm/fpu: Enable fully eager restore kvm FPU

2015-04-23 Thread Rik van Riel
On 04/23/2015 06:57 PM, Wanpeng Li wrote:
 Cc Rik, who is doing the similar work. :)

Hi Liang,

I posted this patch earlier, which should have the same effect as
your patch on more modern systems, while not loading the FPU context
for guests that barely use it on older systems:

https://lkml.org/lkml/2015/4/23/349

I have to admit the diffstat on your patch looks very nice, but
it might be good to know what impact it has on older systems...

 On Fri, Apr 24, 2015 at 05:13:03AM +0800, Liang Li wrote:
 Remove lazy FPU logic and use eager FPU entirely. Eager FPU does
 not have performance regression, and it can simplify the code.

 When compiling kernel on westmere, the performance of eager FPU
 is about 0.4% faster than lazy FPU.

 Signed-off-by: Liang Li liang.z...@intel.com
 Signed-off-by: Xudong Hao xudong@intel.com
 ---
 arch/x86/include/asm/kvm_host.h |  1 -
 arch/x86/kvm/svm.c  | 22 ++--
 arch/x86/kvm/vmx.c  | 74 
 +++--
 arch/x86/kvm/x86.c  |  8 +
 include/linux/kvm_host.h|  2 --
 5 files changed, 9 insertions(+), 98 deletions(-)

 diff --git a/arch/x86/include/asm/kvm_host.h 
 b/arch/x86/include/asm/kvm_host.h
 index dea2e7e..5d84cc9 100644
 --- a/arch/x86/include/asm/kvm_host.h
 +++ b/arch/x86/include/asm/kvm_host.h
 @@ -743,7 +743,6 @@ struct kvm_x86_ops {
  void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
  unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
  void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
 -void (*fpu_deactivate)(struct kvm_vcpu *vcpu);

  void (*tlb_flush)(struct kvm_vcpu *vcpu);

 diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
 index ce741b8..1b3b29b 100644
 --- a/arch/x86/kvm/svm.c
 +++ b/arch/x86/kvm/svm.c
 @@ -1087,7 +1087,6 @@ static void init_vmcb(struct vcpu_svm *svm)
 struct vmcb_control_area *control = &svm->vmcb->control;
 struct vmcb_save_area *save = &svm->vmcb->save;

 -svm->vcpu.fpu_active = 1;
 svm->vcpu.arch.hflags = 0;

  set_cr_intercept(svm, INTERCEPT_CR0_READ);
 @@ -1529,15 +1528,12 @@ static void update_cr0_intercept(struct vcpu_svm 
 *svm)
 ulong gcr0 = svm->vcpu.arch.cr0;
 u64 *hcr0 = &svm->vmcb->save.cr0;

 -if (!svm->vcpu.fpu_active)
 -*hcr0 |= SVM_CR0_SELECTIVE_MASK;
 -else
 -*hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
 -| (gcr0 & SVM_CR0_SELECTIVE_MASK);
 +*hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
 +| (gcr0 & SVM_CR0_SELECTIVE_MASK);

 mark_dirty(svm->vmcb, VMCB_CR);

 -if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
 +if (gcr0 == *hcr0) {
  clr_cr_intercept(svm, INTERCEPT_CR0_READ);
  clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
  } else {
 @@ -1568,8 +1564,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, 
 unsigned long cr0)
  if (!npt_enabled)
  cr0 |= X86_CR0_PG | X86_CR0_WP;

 -if (!vcpu->fpu_active)
 -cr0 |= X86_CR0_TS;
  /*
   * re-enable caching here because the QEMU bios
   * does not do it - this results in some delay at
 @@ -1795,7 +1789,6 @@ static void svm_fpu_activate(struct kvm_vcpu *vcpu)

  clr_exception_intercept(svm, NM_VECTOR);

 -svm->vcpu.fpu_active = 1;
  update_cr0_intercept(svm);
 }

 @@ -4139,14 +4132,6 @@ static bool svm_has_wbinvd_exit(void)
  return true;
 }

 -static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
 -{
 -struct vcpu_svm *svm = to_svm(vcpu);
 -
 -set_exception_intercept(svm, NM_VECTOR);
 -update_cr0_intercept(svm);
 -}
 -
 #define PRE_EX(exit)  { .exit_code = (exit), \
  .stage = X86_ICPT_PRE_EXCEPT, }
 #define POST_EX(exit) { .exit_code = (exit), \
 @@ -4381,7 +4366,6 @@ static struct kvm_x86_ops svm_x86_ops = {
  .cache_reg = svm_cache_reg,
  .get_rflags = svm_get_rflags,
  .set_rflags = svm_set_rflags,
 -.fpu_deactivate = svm_fpu_deactivate,

  .tlb_flush = svm_flush_tlb,

 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index f5e8dce..811a666 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -1567,7 +1567,7 @@ static void update_exception_bitmap(struct kvm_vcpu 
 *vcpu)
  u32 eb;

 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
 - (1u << NM_VECTOR) | (1u << DB_VECTOR);
 + (1u << DB_VECTOR);
 if ((vcpu->guest_debug &
   (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
  (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
 @@ -1576,8 +1576,6 @@ static void update_exception_bitmap(struct kvm_vcpu 
 *vcpu)
  eb = ~0;
  if (enable_ept)
 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
 -if (vcpu->fpu_active)
 -eb &= ~(1u << NM_VECTOR);

  /* When we are running a nested L2 guest and L1 specified for it a
   * certain exception bitmap, we must trap the same exceptions and pass
 @@ -1961,9 +1959,6 @@ static void