Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread Paolo Bonzini
On 18/09/2014 23:54, David Hepkin wrote:
 The chief advantage I see to using a hypercall based mechanism is
 that it would work across more architectures.  MSRs and CPUIDs are
 specific to x86.  If we ever wanted this same mechanism to be
 available on an architecture that doesn't support MSRs, a hypercall
 based approach would allow for a more consistent mechanism across the
 architectures.
 
 I agree, though, that converging on a common hypercall interface that
 would be implemented by all of the hypervisors would likely be much
 harder to achieve.

There are differences between architectures at the hypercall level,
starting with the calling convention.  So I don't think it makes much
sense to use a hypercall.
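
For readers less familiar with the x86 side of this: an MSR-based interface
needs no calling convention beyond the architected rdmsr instruction. A
minimal guest-side sketch, assuming a hypothetical MSR index (the real
number would be assigned by whatever standard emerges from this thread):

#include <stdint.h>

/* Hypothetical MSR index, for illustration only; not an assigned number. */
#define MSR_HV_RNG_SEED 0x40000123

/* rdmsr is privileged, so this would run in guest kernel context. */
static inline uint64_t rdmsr(uint32_t msr)
{
	uint32_t lo, hi;

	asm volatile("rdmsr" : "=a"(lo), "=d"(hi) : "c"(msr));
	return ((uint64_t)hi << 32) | lo;
}

static uint64_t get_hv_rng_seed(void)
{
	/* Each read is expected to return a fresh 64-bit seed. */
	return rdmsr(MSR_HV_RNG_SEED);
}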

Paolo


Re: [PATCH v2] kvm: Faults which trigger IO release the mmap_sem

2014-09-19 Thread Paolo Bonzini
On 19/09/2014 05:58, Andres Lagar-Cavilla wrote:
 Paolo, should I recut including the recent Reviewed-by's?

No, I'll add them myself.

Paolo


[Bug 84781] The guest will hang after live migration.

2014-09-19 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=84781

Paolo Bonzini bonz...@gnu.org changed:

   What|Removed |Added

 Status|NEW |RESOLVED
 Resolution|--- |CODE_FIX

-- 
You are receiving this mail because:
You are watching the assignee of the bug.


Re: [PATCH v3 2/2] KVM: x86: directly use kvm_make_request again

2014-09-19 Thread Xiao Guangrong
On 09/19/2014 12:38 AM, Liang Chen wrote:
 A one-line wrapper around kvm_make_request does not seem
 particularly useful. Replace kvm_mmu_flush_tlb() with
 kvm_make_request() again to free the namespace a bit.
 
 Signed-off-by: Liang Chen liangchen.li...@gmail.com
 ---
  arch/x86/include/asm/kvm_host.h |  1 -
  arch/x86/kvm/mmu.c  | 16 +---
  arch/x86/kvm/vmx.c  |  2 +-
  arch/x86/kvm/x86.c  | 11 ---
  4 files changed, 14 insertions(+), 16 deletions(-)
 
 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
 index 7c492ed..77ade89 100644
 --- a/arch/x86/include/asm/kvm_host.h
 +++ b/arch/x86/include/asm/kvm_host.h
 @@ -917,7 +917,6 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu);
 
  int fx_init(struct kvm_vcpu *vcpu);
 
 -void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
  void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
  const u8 *new, int bytes);
  int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
 diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
 index b41fd97..acc2d0c5 100644
 --- a/arch/x86/kvm/mmu.c
 +++ b/arch/x86/kvm/mmu.c
 @@ -1749,7 +1749,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, 
 struct kvm_mmu_page *sp,
   return 1;
   }
 
 - kvm_mmu_flush_tlb(vcpu);
 + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
   return 0;
  }
 
 @@ -1802,7 +1802,7 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu,  
 gfn_t gfn)
 
   kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
   if (flush)
 - kvm_mmu_flush_tlb(vcpu);
 + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
  }
 
  struct mmu_page_path {
 @@ -2536,7 +2536,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 
 *sptep,
 true, host_writable)) {
   if (write_fault)
   *emulate = 1;
 - kvm_mmu_flush_tlb(vcpu);
 + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
   }
 
   if (unlikely(is_mmio_spte(*sptep) && emulate))
 @@ -3450,12 +3450,6 @@ static void nonpaging_init_context(struct kvm_vcpu 
 *vcpu,
   context->nx = false;
  }
 
 -void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 -{
 - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 -}
 -EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb);
 -
  void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu)
  {
   mmu_free_roots(vcpu);
 @@ -3961,7 +3955,7 @@ static void mmu_pte_write_flush_tlb(struct kvm_vcpu 
 *vcpu, bool zap_page,
   if (remote_flush)
   kvm_flush_remote_tlbs(vcpu->kvm);
   else if (local_flush)
 - kvm_mmu_flush_tlb(vcpu);
 + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
  }
 
  static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
 @@ -4222,7 +4216,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
  void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
  {
   vcpu->arch.mmu.invlpg(vcpu, gva);
 - kvm_mmu_flush_tlb(vcpu);
 + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
   ++vcpu->stat.invlpg;
  }
  EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index bfe11cf..bb0a7ab 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -6617,7 +6617,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
   switch (type) {
   case VMX_EPT_EXTENT_GLOBAL:
   kvm_mmu_sync_roots(vcpu);
 - kvm_mmu_flush_tlb(vcpu);
 + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
   nested_vmx_succeed(vcpu);
   break;
   default:
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index 9eb5458..fc3df50 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -726,7 +726,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
  {
   if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
   kvm_mmu_sync_roots(vcpu);
 - kvm_mmu_flush_tlb(vcpu);
 + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
   return 0;
   }
 
 @@ -5989,6 +5989,12 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
   kvm_apic_update_tmr(vcpu, tmr);
  }
 
 +static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
 +{
  + ++vcpu->stat.tlb_flush;
  + kvm_x86_ops->tlb_flush(vcpu);
 +}
 +
  /*
   * Returns 1 to let __vcpu_run() continue the guest execution loop without
   * exiting to the userspace.  Otherwise, the value will be returned to the
 @@ -6018,8 +6024,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
   if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
   kvm_mmu_sync_roots(vcpu);
   if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
  - ++vcpu->stat.tlb_flush;
  - kvm_x86_ops->tlb_flush(vcpu);
 + kvm_vcpu_flush_tlb(vcpu);

NACK!

Do not understand why you have to introduce a meaningful name
here - it's used just inside a function, which cannot help to
improve the readability of the code at all.

Re: [PATCH RFC 2/2] vhost: support urgent descriptors

2014-09-19 Thread Jason Wang
On 07/01/2014 06:49 PM, Michael S. Tsirkin wrote:
 Signed-off-by: Michael S. Tsirkin m...@redhat.com
 ---
  drivers/vhost/vhost.h | 19 +--
  drivers/vhost/net.c   | 30 +-
  drivers/vhost/scsi.c  | 23 +++
  drivers/vhost/test.c  |  5 +++--
  drivers/vhost/vhost.c | 23 ---
  5 files changed, 68 insertions(+), 32 deletions(-)

 diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
 index 3eda654..61ca542 100644
 --- a/drivers/vhost/vhost.h
 +++ b/drivers/vhost/vhost.h
[...]
  EXPORT_SYMBOL_GPL(vhost_add_used_n);
 @@ -1433,12 +1439,13 @@ static bool vhost_notify(struct vhost_dev *dev, 
 struct vhost_virtqueue *vq)
   unlikely(vq->avail_idx == vq->last_avail_idx))
   return true;
  
 - if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
  + if (vq->urgent || !vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {

So the urgent descriptor only works when event index was not enabled?
This seems suboptimal, we may still want to benefit from event index
even if urgent descriptors are used. Looks like we need to return true
here when vq->urgent is true - see the sketch below.
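
A self-contained toy model of that suggestion (names mirror the patch;
vq->urgent and the event-index decision are simplified into booleans, so
this sketches the control flow rather than actual vhost code):

#include <stdbool.h>

struct vq_model {
	bool urgent;                /* an urgent descriptor was used */
	bool has_event_idx;         /* VIRTIO_RING_F_EVENT_IDX negotiated */
	bool event_idx_wants_signal;
};

/* Urgent descriptors force a notification even when event index
 * would normally suppress it. */
static bool should_notify(struct vq_model *vq)
{
	if (vq->urgent) {
		vq->urgent = false;
		return true;
	}
	if (!vq->has_event_idx)
		return true; /* fall back to flags-based suppression */
	return vq->event_idx_wants_signal;
}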

Another question is whether or not we need a new flag for this.
Technically we can do it purely on the guest side through event index;
this can help to eliminate both the changes in host and a new feature bit.
   __u16 flags;
   if (__get_user(flags, &vq->avail->flags)) {
   vq_err(vq, "Failed to get flags");
   return true;
   }
  + vq->urgent = false;
   return !(flags & VRING_AVAIL_F_NO_INTERRUPT);
   }
   old = vq->signalled_used;
 @@ -1468,9 +1475,10 @@ EXPORT_SYMBOL_GPL(vhost_signal);
  /* And here's the combo meal deal.  Supersize me! */
  void vhost_add_used_and_signal(struct vhost_dev *dev,
  struct vhost_virtqueue *vq,
 +bool urgent,
  unsigned int head, int len)
  {
 - vhost_add_used(vq, head, len);
 + vhost_add_used(vq, urgent, head, len);
   vhost_signal(dev, vq);
  }
  EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
 @@ -1478,9 +1486,10 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
  /* multi-buffer version of vhost_add_used_and_signal */
  void vhost_add_used_and_signal_n(struct vhost_dev *dev,
struct vhost_virtqueue *vq,
 +  bool urgent,
struct vring_used_elem *heads, unsigned count)
  {
 - vhost_add_used_n(vq, heads, count);
 + vhost_add_used_n(vq, urgent, heads, count);
   vhost_signal(dev, vq);
  }
  EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);



Re: [RESEND PATCH 1/3] x86: Adding structs to reflect cpuid fields

2014-09-19 Thread Borislav Petkov
On Thu, Sep 18, 2014 at 03:36:43PM +0200, Paolo Bonzini wrote:
 We're talking about the case where the field is not reserved anymore and
 we _know_ that the vendor has just decided to grow the bitfield that
 precedes it.

We're talking about the case where you assumed that a reserved bit is 0,
which is an unsafe assumption, to say the least.

 As soon as we know that the field is not reserved anymore, we
 obviously rely on reserved bits being zero in older processors, and in
 future processors from other vendors.

Again, this is an unsafe assumption.

 The trivial example is feature bits like XSAVE. We query them all the
 time without checking the family when they were first introduced,
 don't we?

The feature bits would obviously be 0 if features are not supported.

However, even there

16 - Reserved - Do not count on the value.

I'm quoting Intel's CPUID doc 241618-037 from 2011 (there might be a
newer one though), the CPUID(1).ECX description.
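
For the record, a feature-bit query in userspace looks like this - a sketch
using GCC's <cpuid.h>; XSAVE is CPUID.1:ECX bit 26:

#include <cpuid.h>
#include <stdbool.h>

#define XSAVE_BIT (1u << 26) /* CPUID.1:ECX[26] */

static bool cpu_has_xsave(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* No family/model check: the bit is simply expected to read as
	 * zero on processors that predate the feature. */
	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return false;
	return ecx & XSAVE_BIT;
}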

-- 
Regards/Gruss,
Boris.


Re: [RESEND PATCH 1/3] x86: Adding structs to reflect cpuid fields

2014-09-19 Thread Nadav Amit

On Sep 19, 2014, at 10:58 AM, Borislav Petkov b...@alien8.de wrote:

 On Thu, Sep 18, 2014 at 03:36:43PM +0200, Paolo Bonzini wrote:
 We're talking about the case where the field is not reserved anymore and
 we _know_ that the vendor has just decided to grow the bitfield that
 precedes it.
 
  We're talking about the case where you assumed that a reserved bit is 0,
  which is an unsafe assumption, to say the least.
 
 As soon as we know that the field is not reserved anymore, we
 obviously rely on reserved bits being zero in older processors, and in
 future processors from other vendors.
 
 Again, this is an unsafe assumption.
 
 The trivial example is feature bits like XSAVE. We query them all the
 time without checking the family when they were first introduced,
 don't we?
 
 The feature bits would obviously be 0 if features are not supported.
 
 However, even there
 
 16 - Reserved - Do not count on the value.
 
 I'm quoting Intel's CPUID doc 241618-037 from 2011 (there might be a
 newer one though), the CPUID(1).ECX description.

New fields which replace reserved bits would be handled the same way with
bitfields and bit masks.

As for the concern that CPUID fields would be extended into non-zero reserved 
bits - can someone point me to a single case that occurred in the last 20 years 
during which CPUID existed? 

The closest thing I found was “extended family id”, but this field is separate
from “family id” and treated this way.

Nadav




Re: [PATCH RFC 2/2] vhost: support urgent descriptors

2014-09-19 Thread Jason Wang
On 07/01/2014 06:49 PM, Michael S. Tsirkin wrote:
 Signed-off-by: Michael S. Tsirkin m...@redhat.com
 ---
  drivers/vhost/vhost.h | 19 +--
  drivers/vhost/net.c   | 30 +-
  drivers/vhost/scsi.c  | 23 +++
  drivers/vhost/test.c  |  5 +++--
  drivers/vhost/vhost.c | 23 ---
  5 files changed, 68 insertions(+), 32 deletions(-)

 diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
 index 3eda654..61ca542 100644
 --- a/drivers/vhost/vhost.h
 +++ b/drivers/vhost/vhost.h
 @@ -96,6 +96,9 @@ struct vhost_virtqueue {
   /* Last used index value we have signalled on */
   bool signalled_used_valid;
  
 + /* Urgent descriptor was used */
 + bool urgent;
 +
   /* Log writes to used structure. */
   bool log_used;
   u64 log_addr;
 @@ -138,20 +141,24 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, 
 void __user *argp);
  int vhost_vq_access_ok(struct vhost_virtqueue *vq);
  int vhost_log_access_ok(struct vhost_dev *);
  
 -int vhost_get_vq_desc(struct vhost_virtqueue *,
 +int vhost_get_vq_desc(struct vhost_virtqueue *, bool *urgent,
 struct iovec iov[], unsigned int iov_count,
 unsigned int *out_num, unsigned int *in_num,
 struct vhost_log *log, unsigned int *log_num);
  void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
  
  int vhost_init_used(struct vhost_virtqueue *);
 -int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
 -int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
 -  unsigned count);
 -void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
 +int vhost_add_used(struct vhost_virtqueue *, bool urgent,
 +unsigned int head, int len);
 +int vhost_add_used_n(struct vhost_virtqueue *, bool urgent,
 +  struct vring_used_elem *heads, unsigned count);
 +void vhost_add_used_and_signal(struct vhost_dev *,
 +struct vhost_virtqueue *,
 +bool urgent,
  unsigned int id, int len);
  void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue 
 *,
 -struct vring_used_elem *heads, unsigned count);
 +  bool urgent,
 +  struct vring_used_elem *heads, unsigned count);
  void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
  void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *);
  bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *);
 diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
 index 8dae2f7..5f0567f 100644
 --- a/drivers/vhost/net.c
 +++ b/drivers/vhost/net.c
  @@ -48,9 +48,11 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX");
   * status internally; used for zerocopy tx only.
   */
  /* Lower device DMA failed */
 -#define VHOST_DMA_FAILED_LEN 3
 +#define VHOST_DMA_FAILED_LEN 4
  /* Lower device DMA done */
 -#define VHOST_DMA_DONE_LEN   2
 +#define VHOST_DMA_DONE_LEN   3
 +/* Lower device DMA in progress, urgent bit set */
 +#define VHOST_DMA_URGENT 2
  /* Lower device DMA in progress */
  #define VHOST_DMA_IN_PROGRESS1
  /* Buffer unused */
 @@ -284,11 +286,13 @@ static void vhost_zerocopy_signal_used(struct vhost_net 
 *net,
   container_of(vq, struct vhost_net_virtqueue, vq);
   int i, add;
   int j = 0;
 + bool urgent = false;
  
   for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
   if (vq->heads[i].len == VHOST_DMA_FAILED_LEN)
   vhost_net_tx_err(net);
   if (VHOST_DMA_IS_DONE(vq->heads[i].len)) {
  + urgent = urgent || vq->heads[i].len == VHOST_DMA_URGENT;

A problem is that len is either VHOST_DMA_DONE_LEN or VHOST_DMA_FAILED_LEN
here. Looks like we need another new VHOST_DMA_DONE_LEN_URGENT and to
change the len to this value in the zerocopy callback if it was an urgent
descriptor.
   vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
   ++j;
   } else
 @@ -296,7 +300,7 @@ static void vhost_zerocopy_signal_used(struct vhost_net 
 *net,
   }
   while (j) {
   add = min(UIO_MAXIOV - nvq->done_idx, j);
  - vhost_add_used_and_signal_n(vq->dev, vq,
  + vhost_add_used_and_signal_n(vq->dev, vq, urgent,
   &vq->heads[nvq->done_idx], add);
   nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV;
   j -= add;
 @@ -363,6 +367,7 @@ static void handle_tx(struct vhost_net *net)
   zcopy = nvq->ubufs;
  
   for (;;) {
 + bool urgent;
   /* Release DMAs done buffers first */
   if (zcopy)
   vhost_zerocopy_signal_used(net, vq);
 @@ -374,7 +379,7 @@ static void 

[RFC patch 6/6] vfio: make vfio run on s390 platform

2014-09-19 Thread frank . blaschka
From: Frank Blaschka frank.blasc...@de.ibm.com

Following changes are made because of platform differences:

1) s390 does not support mmap'ing of PCI BARs so we have to go via the slow path
2) no intx support
3) no classic MSIX interrupts. The pci hw understands the concept
   of requesting MSIX irqs, but irqs are delivered as s390 adapter irqs.
   Introduce s390 specific functions for msix notification (slow path) and
   msi routes (kvm fast path).
4) Use type1 iommu but register only for the iommu address space

Signed-off-by: Frank Blaschka frank.blasc...@de.ibm.com
---
 hw/misc/vfio.c |   24 
 1 file changed, 24 insertions(+)

--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -40,6 +40,9 @@
 #include "sysemu/kvm.h"
 #include "sysemu/sysemu.h"
 #include "hw/misc/vfio.h"
+#ifdef TARGET_S390X
+#include "hw/s390x/s390-pci-bus.h"
+#endif
 
 /* #define DEBUG_VFIO */
 #ifdef DEBUG_VFIO
@@ -51,7 +54,11 @@
 #endif
 
 /* Extra debugging, trap acceleration paths for more logging */
+#ifdef TARGET_S390X
+#define VFIO_ALLOW_MMAP 0
+#else
 #define VFIO_ALLOW_MMAP 1
+#endif
 #define VFIO_ALLOW_KVM_INTX 1
 #define VFIO_ALLOW_KVM_MSI 1
 #define VFIO_ALLOW_KVM_MSIX 1
@@ -554,6 +561,10 @@ static int vfio_enable_intx(VFIODevice *
 struct vfio_irq_set *irq_set;
 int32_t *pfd;
 
+#ifdef TARGET_S390X
+return 0;
+#endif
+
 if (!pin) {
 return 0;
 }
@@ -664,7 +675,11 @@ static void vfio_msi_interrupt(void *opa
 #endif
 
 if (vdev->interrupt == VFIO_INT_MSIX) {
+#ifdef TARGET_S390X
+s390_msix_notify(&vdev->pdev, nr);
+#else
 msix_notify(&vdev->pdev, nr);
+#endif
 } else if (vdev->interrupt == VFIO_INT_MSI) {
 msi_notify(&vdev->pdev, nr);
 } else {
@@ -730,7 +745,11 @@ static void vfio_add_kvm_msi_virq(VFIOMS
 return;
 }
 
+#ifdef TARGET_S390X
+virq = s390_irqchip_add_msi_route(&vector->vdev->pdev, kvm_state, *msg);
+#else
 virq = kvm_irqchip_add_msi_route(kvm_state, *msg);
+#endif
 if (virq < 0) {
 event_notifier_cleanup(&vector->kvm_interrupt);
 return;
@@ -3702,8 +3721,13 @@ static int vfio_connect_container(VFIOGr
 container->iommu_data.type1.listener = vfio_memory_listener;
 container->iommu_data.release = vfio_listener_release;
 
+#ifdef TARGET_S390X
+memory_listener_register(&container->iommu_data.type1.listener,
+ &container->space->as);
+#else
 memory_listener_register(&container->iommu_data.type1.listener,
  &address_space_memory);
+#endif
 
 if (container->iommu_data.type1.error) {
 ret = container->iommu_data.type1.error;



[RFC patch 3/6] vfio: make vfio build on s390

2014-09-19 Thread frank . blaschka
From: Frank Blaschka frank.blasc...@de.ibm.com

This patch adds some small changes to make vfio build on s390.

Signed-off-by: Frank Blaschka frank.blasc...@de.ibm.com
---
 drivers/vfio/Kconfig |2 +-
 drivers/vfio/pci/vfio_pci_rdwr.c |8 
 2 files changed, 9 insertions(+), 1 deletion(-)

--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -16,7 +16,7 @@ config VFIO_SPAPR_EEH
 menuconfig VFIO
	tristate "VFIO Non-Privileged userspace driver framework"
depends on IOMMU_API
-   select VFIO_IOMMU_TYPE1 if X86
+   select VFIO_IOMMU_TYPE1 if (X86 || S390)
select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES)
select VFIO_SPAPR_EEH if (PPC_POWERNV || PPC_PSERIES)
select ANON_INODES
--- a/drivers/vfio/pci/vfio_pci_rdwr.c
+++ b/drivers/vfio/pci/vfio_pci_rdwr.c
@@ -177,6 +177,13 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_
return done;
 }
 
+#ifdef CONFIG_NO_IOPORT_MAP
+ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf,
+   size_t count, loff_t *ppos, bool iswrite)
+{
+   return -EINVAL;
+}
+#else
 ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf,
   size_t count, loff_t *ppos, bool iswrite)
 {
@@ -236,3 +243,4 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_
 
return done;
 }
+#endif



[RFC patch 0/6] vfio based pci pass-through for qemu/KVM on s390

2014-09-19 Thread frank . blaschka
This set of patches implements a vfio based solution for pci
pass-through on the s390 platform. The kernel stuff is pretty
much straightforward, but qemu needs more work.

Most interesting patch is:
  vfio: make vfio run on s390 platform

I hope Alex & Alex can give me some guidance how to do the changes
in an appropriate way. After creating a separate iommu address space
for each attached PCI device I can successfully run the vfio type1
iommu. So if we could extend type1 to not register all guest memory
(see patch) I think we do not need a special vfio iommu for s390
for the moment.

The patches implement the base pass-through support. s390 specific
virtualization functions are currently not included. This would
be a second step after the base support is done.

kernel patches apply to linux-kvm-next

KVM: s390: Enable PCI instructions
iommu: add iommu for s390 platform
vfio: make vfio build on s390

qemu patches apply to qemu-master

s390: Add PCI bus support
s390: implement pci instruction
vfio: make vfio run on s390 platform

Thx for feedback and review comments

Frank



[RFC patch 4/6] s390: Add PCI bus support

2014-09-19 Thread frank . blaschka
From: Frank Blaschka frank.blasc...@de.ibm.com

This patch implements a pci bus for s390x together with some infrastructure
to generate and handle hotplug events. It also provides device 
configuration/unconfiguration via sclp instruction interception.

Signed-off-by: Frank Blaschka frank.blasc...@de.ibm.com
---
 default-configs/s390x-softmmu.mak |1 
 hw/s390x/Makefile.objs|1 
 hw/s390x/css.c|5 
 hw/s390x/css.h|1 
 hw/s390x/s390-pci-bus.c   |  404 ++
 hw/s390x/s390-pci-bus.h   |  166 +++
 hw/s390x/s390-virtio-ccw.c|2 
 hw/s390x/sclp.c   |   10 
 include/hw/s390x/sclp.h   |8 
 target-s390x/ioinst.c |   52 
 target-s390x/ioinst.h |1 
 11 files changed, 650 insertions(+), 1 deletion(-)

--- a/default-configs/s390x-softmmu.mak
+++ b/default-configs/s390x-softmmu.mak
@@ -1,3 +1,4 @@
+include pci.mak
 CONFIG_VIRTIO=y
 CONFIG_SCLPCONSOLE=y
 CONFIG_S390_FLIC=y
--- a/hw/s390x/Makefile.objs
+++ b/hw/s390x/Makefile.objs
@@ -8,3 +8,4 @@ obj-y += ipl.o
 obj-y += css.o
 obj-y += s390-virtio-ccw.o
 obj-y += virtio-ccw.o
+obj-$(CONFIG_KVM) += s390-pci-bus.o
--- a/hw/s390x/css.c
+++ b/hw/s390x/css.c
@@ -1281,6 +1281,11 @@ void css_generate_chp_crws(uint8_t cssid
 /* TODO */
 }
 
+void css_generate_css_crws(uint8_t cssid)
+{
+css_queue_crw(CRW_RSC_CSS, 0, 0, 0);
+}
+
 int css_enable_mcsse(void)
 {
 trace_css_enable_facility(mcsse);
--- a/hw/s390x/css.h
+++ b/hw/s390x/css.h
@@ -99,6 +99,7 @@ void css_queue_crw(uint8_t rsc, uint8_t
 void css_generate_sch_crws(uint8_t cssid, uint8_t ssid, uint16_t schid,
int hotplugged, int add);
 void css_generate_chp_crws(uint8_t cssid, uint8_t chpid);
+void css_generate_css_crws(uint8_t cssid);
 void css_adapter_interrupt(uint8_t isc);
 
 #define CSS_IO_ADAPTER_VIRTIO 1
--- /dev/null
+++ b/hw/s390x/s390-pci-bus.c
@@ -0,0 +1,404 @@
+/*
+ * s390 PCI BUS
+ *
+ * Copyright 2014 IBM Corp.
+ * Author(s): Frank Blaschka frank.blasc...@de.ibm.com
+ *Hong Bo Li lih...@cn.ibm.com
+ *Yi Min Zhao zyi...@cn.ibm.com
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or (at
+ * your option) any later version. See the COPYING file in the top-level
+ * directory.
+ */
+
+#include "hw/pci/pci.h"
+#include "hw/s390x/css.h"
+#include "hw/s390x/sclp.h"
+#include "hw/pci/msi.h"
+#include "qemu/error-report.h"
+#include "s390-pci-bus.h"
+
+/* #define DEBUG_S390PCI_BUS */
+#ifdef DEBUG_S390PCI_BUS
+#define DPRINTF(fmt, ...) \
+do { fprintf(stderr, "S390pci-bus: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+do { } while (0)
+#endif
+
+static const unsigned long be_to_le = BITS_PER_LONG - 1;
+static QTAILQ_HEAD(, SeiContainer) pending_sei =
+QTAILQ_HEAD_INITIALIZER(pending_sei);
+static QTAILQ_HEAD(, S390PCIBusDevice) device_list =
+QTAILQ_HEAD_INITIALIZER(device_list);
+
+int chsc_sei_nt2_get_event(void *res)
+{
+ChscSeiNt2Res *nt2_res = (ChscSeiNt2Res *)res;
+PciCcdfAvail *accdf;
+PciCcdfErr *eccdf;
+int rc = 1;
+SeiContainer *sei_cont;
+
+sei_cont = QTAILQ_FIRST(&pending_sei);
+if (sei_cont) {
+QTAILQ_REMOVE(&pending_sei, sei_cont, link);
+nt2_res->nt = 2;
+nt2_res->cc = sei_cont->cc;
+switch (sei_cont->cc) {
+case 1: /* error event */
+eccdf = (PciCcdfErr *)nt2_res->ccdf;
+eccdf->fid = cpu_to_be32(sei_cont->fid);
+eccdf->fh = cpu_to_be32(sei_cont->fh);
+break;
+case 2: /* availability event */
+accdf = (PciCcdfAvail *)nt2_res->ccdf;
+accdf->fid = cpu_to_be32(sei_cont->fid);
+accdf->fh = cpu_to_be32(sei_cont->fh);
+accdf->pec = cpu_to_be16(sei_cont->pec);
+break;
+default:
+abort();
+}
+g_free(sei_cont);
+rc = 0;
+}
+
+return rc;
+}
+
+int chsc_sei_nt2_have_event(void)
+{
+return !QTAILQ_EMPTY(&pending_sei);
+}
+
+static S390PCIBusDevice *s390_pci_find_dev_by_fid(uint32_t fid)
+{
+S390PCIBusDevice *pbdev;
+
+QTAILQ_FOREACH(pbdev, &device_list, next) {
+if (pbdev->fid == fid) {
+return pbdev;
+}
+}
+return NULL;
+}
+
+void s390_pci_sclp_configure(int configure, SCCB *sccb)
+{
+PciCfgSccb *psccb = (PciCfgSccb *)sccb;
+S390PCIBusDevice *pbdev = 
s390_pci_find_dev_by_fid(be32_to_cpu(psccb->aid));
+uint16_t rc;
+
+if (pbdev) {
+if ((configure == 1 && pbdev->configured == true) ||
+(configure == 0 && pbdev->configured == false)) {
+rc = SCLP_RC_NO_ACTION_REQUIRED;
+} else {
+pbdev->configured = !pbdev->configured;
+rc = SCLP_RC_NORMAL_COMPLETION;
+}
+} else {
+DPRINTF("sclp config %d no dev found\n", configure);
+rc = 

[RFC patch 2/6] iommu: add iommu for s390 platform

2014-09-19 Thread frank . blaschka
From: Frank Blaschka frank.blasc...@de.ibm.com

Add a basic iommu for the s390 platform. The code is pretty
simple since on s390 each PCI device has its own virtual io address
space starting at the same vio address. Because of this, a domain can
hold only one pci device. Also, there is no relation between pci
devices, so each device belongs to a separate iommu group.

Signed-off-by: Frank Blaschka frank.blasc...@de.ibm.com
---
 arch/s390/include/asm/pci.h |3 
 arch/s390/pci/pci_dma.c |   21 -
 drivers/iommu/Kconfig   |9 ++
 drivers/iommu/Makefile  |1 
 drivers/iommu/s390-iommu.c  |  181 
 5 files changed, 213 insertions(+), 2 deletions(-)

--- a/arch/s390/include/asm/pci.h
+++ b/arch/s390/include/asm/pci.h
@@ -177,6 +177,9 @@ struct zpci_dev *get_zdev_by_fid(u32);
 /* DMA */
 int zpci_dma_init(void);
 void zpci_dma_exit(void);
+int dma_update_trans(struct zpci_dev *zdev, unsigned long pa,
+dma_addr_t dma_addr, size_t size, int flags);
+void dma_purge_rto_entries(struct zpci_dev *zdev);
 
 /* FMB */
 int zpci_fmb_enable_device(struct zpci_dev *);
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -139,8 +139,8 @@ static void dma_update_cpu_trans(struct
entry_clr_protected(entry);
 }
 
-static int dma_update_trans(struct zpci_dev *zdev, unsigned long pa,
-   dma_addr_t dma_addr, size_t size, int flags)
+int dma_update_trans(struct zpci_dev *zdev, unsigned long pa,
+dma_addr_t dma_addr, size_t size, int flags)
 {
	unsigned int nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
	u8 *page_addr = (u8 *) (pa & PAGE_MASK);
@@ -180,6 +180,7 @@ no_refresh:
	spin_unlock_irqrestore(&zdev->dma_table_lock, irq_flags);
return rc;
 }
+EXPORT_SYMBOL_GPL(dma_update_trans);
 
 static void dma_free_seg_table(unsigned long entry)
 {
@@ -210,6 +211,22 @@ static void dma_cleanup_tables(struct zp
	zdev->dma_table = NULL;
 }
 
+void dma_purge_rto_entries(struct zpci_dev *zdev)
+{
+   unsigned long *table;
+   int rtx;
+
+   if (!zdev || !zdev->dma_table)
+   return;
+   table = zdev->dma_table;
+   for (rtx = 0; rtx < ZPCI_TABLE_ENTRIES; rtx++)
+   if (reg_entry_isvalid(table[rtx])) {
+   dma_free_seg_table(table[rtx]);
+   invalidate_table_entry(table[rtx]);
+   }
+}
+EXPORT_SYMBOL_GPL(dma_purge_rto_entries);
+
 static unsigned long __dma_alloc_iommu(struct zpci_dev *zdev,
   unsigned long start, int size)
 {
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -302,4 +302,13 @@ config ARM_SMMU
  Say Y here if your SoC includes an IOMMU device implementing
  the ARM SMMU architecture.
 
+config S390_IOMMU
+bool "s390 IOMMU Support"
+depends on S390
+select IOMMU_API
+help
+  Support for the IBM s/390 IOMMU
+
+  If unsure, say N here.
+
 endif # IOMMU_SUPPORT
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -19,3 +19,4 @@ obj-$(CONFIG_EXYNOS_IOMMU) += exynos-iom
 obj-$(CONFIG_SHMOBILE_IOMMU) += shmobile-iommu.o
 obj-$(CONFIG_SHMOBILE_IPMMU) += shmobile-ipmmu.o
 obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o
+obj-$(CONFIG_S390_IOMMU) += s390-iommu.o
--- /dev/null
+++ b/drivers/iommu/s390-iommu.c
@@ -0,0 +1,181 @@
+#include <linux/io.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/pm_runtime.h>
+#include <linux/clk.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <linux/iommu.h>
+#include <linux/errno.h>
+#include <linux/list.h>
+#include <linux/memblock.h>
+#include <linux/export.h>
+#include <linux/pci.h>
+#include <linux/sizes.h>
+#include <asm/pci_dma.h>
+
+#define S390_IOMMU_PGSIZES SZ_4K
+
+struct s390_domain {
+   struct zpci_dev *zdev;
+};
+
+static int s390_iommu_domain_init(struct iommu_domain *domain)
+{
+   struct s390_domain *priv;
+
+   priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+   if (!priv)
+   return -ENOMEM;
+
+   domain->priv = priv;
+   return 0;
+}
+
+static void s390_iommu_domain_destroy(struct iommu_domain *domain)
+{
+   kfree(domain->priv);
+   domain->priv = NULL;
+}
+
+static int s390_iommu_attach_device(struct iommu_domain *domain,
+   struct device *dev)
+{
+   struct s390_domain *priv = domain->priv;
+
+   if (priv->zdev)
+   return -EEXIST;
+
+   priv->zdev = (struct zpci_dev *)to_pci_dev(dev)->sysdata;
+   return 0;
+}
+
+static void s390_iommu_detach_device(struct iommu_domain *domain,
+struct device *dev)
+{
+   struct s390_domain *priv = domain->priv;
+
+   dma_purge_rto_entries(priv->zdev);
+   priv->zdev = NULL;
+}
+
+static int s390_iommu_map(struct iommu_domain *domain, unsigned long iova,
+ 

[RFC patch 1/6] KVM: s390: Enable PCI instructions

2014-09-19 Thread frank . blaschka
Enable PCI instructions for s390 KVM.

Signed-off-by: Frank Blaschka frank.blasc...@de.ibm.com
---
 arch/s390/kvm/kvm-s390.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1787,7 +1787,7 @@ static int __init kvm_s390_init(void)
}
memcpy(vfacilities, S390_lowcore.stfle_fac_list, 16);
	vfacilities[0] &= 0xff82fff3f4fc2000UL;
-	vfacilities[1] &= 0x005cUL;
+	vfacilities[1] &= 0x07dcUL;
return 0;
 }
 



Re: [RESEND PATCH 1/3] x86: Adding structs to reflect cpuid fields

2014-09-19 Thread Borislav Petkov
On Fri, Sep 19, 2014 at 11:59:32AM +0300, Nadav Amit wrote:
 As for the concern that CPUID fields would be extended into non-zero
 reserved bits - can someone point me to a single case that occurred in
 the last 20 years during which CPUID existed?

Do you have a guarantee that this won't happen in the future and break
all your fancy bitfields assumptions?

-- 
Regards/Gruss,
Boris.


Re: [PATCH v3 2/2] KVM: x86: directly use kvm_make_request again

2014-09-19 Thread Radim Krčmář
2014-09-19 14:12+0800, Xiao Guangrong:
 On 09/19/2014 12:38 AM, Liang Chen wrote:
  A one-line wrapper around kvm_make_request does not seem
  particularly useful. Replace kvm_mmu_flush_tlb() with
  kvm_make_request() again to free the namespace a bit.
  
  Signed-off-by: Liang Chen liangchen.li...@gmail.com
  ---
   arch/x86/include/asm/kvm_host.h |  1 -
   arch/x86/kvm/mmu.c  | 16 +---
   arch/x86/kvm/vmx.c  |  2 +-
   arch/x86/kvm/x86.c  | 11 ---
   4 files changed, 14 insertions(+), 16 deletions(-)
  
  diff --git a/arch/x86/include/asm/kvm_host.h 
  b/arch/x86/include/asm/kvm_host.h
  index 7c492ed..77ade89 100644
  --- a/arch/x86/include/asm/kvm_host.h
  +++ b/arch/x86/include/asm/kvm_host.h
  @@ -917,7 +917,6 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu);
  
   int fx_init(struct kvm_vcpu *vcpu);
  
  -void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
   void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 const u8 *new, int bytes);
   int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
  diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
  index b41fd97..acc2d0c5 100644
  --- a/arch/x86/kvm/mmu.c
  +++ b/arch/x86/kvm/mmu.c
  @@ -1749,7 +1749,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, 
  struct kvm_mmu_page *sp,
  return 1;
  }
  
  -   kvm_mmu_flush_tlb(vcpu);
  +   kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
  return 0;
   }
  
  @@ -1802,7 +1802,7 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu,  
  gfn_t gfn)
  
   kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
  if (flush)
  -   kvm_mmu_flush_tlb(vcpu);
  +   kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
   }
  
   struct mmu_page_path {
  @@ -2536,7 +2536,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 
  *sptep,
true, host_writable)) {
  if (write_fault)
  *emulate = 1;
  -   kvm_mmu_flush_tlb(vcpu);
  +   kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
  }
  
   if (unlikely(is_mmio_spte(*sptep) && emulate))
  @@ -3450,12 +3450,6 @@ static void nonpaging_init_context(struct kvm_vcpu 
  *vcpu,
   context->nx = false;
   }
  
  -void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
  -{
  -   kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
  -}
  -EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb);
  -
   void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu)
   {
  mmu_free_roots(vcpu);
  @@ -3961,7 +3955,7 @@ static void mmu_pte_write_flush_tlb(struct kvm_vcpu 
  *vcpu, bool zap_page,
  if (remote_flush)
   kvm_flush_remote_tlbs(vcpu->kvm);
  else if (local_flush)
  -   kvm_mmu_flush_tlb(vcpu);
  +   kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
   }
  
   static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
  @@ -4222,7 +4216,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
   void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
   {
   vcpu->arch.mmu.invlpg(vcpu, gva);
  -   kvm_mmu_flush_tlb(vcpu);
  +   kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
   ++vcpu->stat.invlpg;
   }
   EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
  diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
  index bfe11cf..bb0a7ab 100644
  --- a/arch/x86/kvm/vmx.c
  +++ b/arch/x86/kvm/vmx.c
  @@ -6617,7 +6617,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
  switch (type) {
  case VMX_EPT_EXTENT_GLOBAL:
  kvm_mmu_sync_roots(vcpu);
  -   kvm_mmu_flush_tlb(vcpu);
  +   kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
  nested_vmx_succeed(vcpu);
  break;
  default:
  diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
  index 9eb5458..fc3df50 100644
  --- a/arch/x86/kvm/x86.c
  +++ b/arch/x86/kvm/x86.c
  @@ -726,7 +726,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long 
  cr3)
   {
   if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
  kvm_mmu_sync_roots(vcpu);
  -   kvm_mmu_flush_tlb(vcpu);
  +   kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
  return 0;
  }
  
  @@ -5989,6 +5989,12 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
  kvm_apic_update_tmr(vcpu, tmr);
   }
  
  +static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
  +{
  +   ++vcpu->stat.tlb_flush;
  +   kvm_x86_ops->tlb_flush(vcpu);
  +}
  +
   /*
* Returns 1 to let __vcpu_run() continue the guest execution loop without
* exiting to the userspace.  Otherwise, the value will be returned to the
  @@ -6018,8 +6024,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
  if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
  kvm_mmu_sync_roots(vcpu);
  if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
  -   ++vcpu->stat.tlb_flush;
  -   kvm_x86_ops->tlb_flush(vcpu);
  +   kvm_vcpu_flush_tlb(vcpu);
 
 NACK!
 
 Do not understand why you have to introduce a meaningful name
 here - 

Re: [PATCH] blk-mq: Avoid race condition with uninitialized requests

2014-09-19 Thread Christian Borntraeger
On 09/18/2014 11:04 AM, David Hildenbrand wrote:
 This patch should fix the bug reported in https://lkml.org/lkml/2014/9/11/249.
 
 We have to initialize at least the atomic_flags and the cmd_flags when
 allocating storage for the requests.
 
 Otherwise blk_mq_timeout_check() might dereference uninitialized pointers when
 racing with the creation of a request.
 
 Also move the reset of cmd_flags from the initialization code to the point
 where a request is freed. So we will never end up with pending flush request
 indicators that might trigger dereferences of invalid pointers in
 blk_mq_timeout_check().
 
 Cc: sta...@vger.kernel.org
 Signed-off-by: David Hildenbrand d...@linux.vnet.ibm.com

So far our testers were not able to reproduce the problem with this patch
applied; we will continue testing over the weekend. I will give an ACK/NACK
on Monday then.

Thanks


 ---
  block/blk-mq.c | 4 +++-
  1 file changed, 3 insertions(+), 1 deletion(-)
 
 diff --git a/block/blk-mq.c b/block/blk-mq.c
 index 383ea0c..eed6340 100644
 --- a/block/blk-mq.c
 +++ b/block/blk-mq.c
 @@ -203,7 +203,6 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, 
 int rw)
   if (tag != BLK_MQ_TAG_FAIL) {
   rq = data->hctx->tags->rqs[tag];
 
  - rq->cmd_flags = 0;
   if (blk_mq_tag_busy(data->hctx)) {
   rq->cmd_flags = REQ_MQ_INFLIGHT;
   atomic_inc(&data->hctx->nr_active);
 @@ -258,6 +257,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx 
 *hctx,
 
   if (rq->cmd_flags & REQ_MQ_INFLIGHT)
   atomic_dec(&hctx->nr_active);
  + rq->cmd_flags = 0;
 
   clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
   blk_mq_put_tag(hctx, tag, &ctx->last_tag);
 @@ -1404,6 +1404,8 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct 
 blk_mq_tag_set *set,
   left -= to_do * rq_size;
   for (j = 0; j  to_do; j++) {
   tags->rqs[i] = p;
  + tags->rqs[i]->atomic_flags = 0;
  + tags->rqs[i]->cmd_flags = 0;
   if (set->ops->init_request) {
   if (set->ops->init_request(set->driver_data,
   tags->rqs[i], hctx_idx, i,
 



Re: KVM: nested VMX: disable perf cpuid reporting

2014-09-19 Thread Paolo Bonzini
On 18/09/2014 23:24, Marcelo Tosatti wrote:
 
 Initialization of L2 guest with -cpu host, on L1 guest with -cpu host
 triggers:
 
 (qemu) KVM: entry failed, hardware error 0x7
 ...
 nested_vmx_run: VMCS MSR_{LOAD,STORE} unsupported
 
 Nested VMX MSR load/store support is not sufficient to 
 allow perf for L2 guest.
 
 Until properly fixed, trap CPUID and disable function 0xA.
 
 Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
 
 diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
 index e28d798..976e3a5 100644
 --- a/arch/x86/kvm/cpuid.c
 +++ b/arch/x86/kvm/cpuid.c
 @@ -774,6 +774,12 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 
 *ebx, u32 *ecx, u32 *edx)
   if (!best)
   best = check_cpuid_limit(vcpu, function, index);
  
 + /*
 +  * Perfmon not yet supported for L2 guest.
 +  */
  + if (is_guest_mode(vcpu) && function == 0xa)
 + best = NULL;
 +
   if (best) {
   *eax = best->eax;
   *ebx = best->ebx;
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index e8f08e9..2ab5047 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -6986,6 +6986,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu 
 *vcpu)
   case EXIT_REASON_TASK_SWITCH:
   return 1;
   case EXIT_REASON_CPUID:
 + if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa)
 + return 0;
   return 1;
   case EXIT_REASON_HLT:
   return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
 

Nice solution. :)

Paolo


Re: [PATCH 1/1] KVM: correct null pid check in kvm_vcpu_yield_to()

2014-09-19 Thread Paolo Bonzini
On 19/09/2014 01:40, Sam Bobroff wrote:
 Correct a simple mistake of checking the wrong variable
 before a dereference, resulting in the dereference not being
 properly protected by rcu_dereference().
 
 Signed-off-by: Sam Bobroff sam.bobr...@au1.ibm.com
 ---
 
  virt/kvm/kvm_main.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
 index 33712fb..688acb0 100644
 --- a/virt/kvm/kvm_main.c
 +++ b/virt/kvm/kvm_main.c
 @@ -1725,7 +1725,7 @@ int kvm_vcpu_yield_to(struct kvm_vcpu *target)
   rcu_read_lock();
   pid = rcu_dereference(target->pid);
   if (pid)
  - task = get_pid_task(target->pid, PIDTYPE_PID);
 + task = get_pid_task(pid, PIDTYPE_PID);
   rcu_read_unlock();
   if (!task)
   return ret;
 

Thanks, applying to kvm/master.

Paolo


Re: [Qemu-devel] [question] virtio-blk performance degradation happened with virtio-serial

2014-09-19 Thread Paolo Bonzini
On 19/09/2014 07:53, Fam Zheng wrote:
 Any ideas?

The obvious, but hardish one is to switch to epoll (one epoll fd per
AioContext, plus one for iohandler.c).

This would require converting iohandler.c to a GSource.
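
For the unconvinced, a minimal sketch of the epoll pattern being suggested -
plain POSIX, no QEMU specifics, with stdin standing in for the fds an
AioContext would watch:

#include <sys/epoll.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy event loop: one epoll fd watching many handler fds, replacing a
 * g_poll()-style scan that is linear in the number of fds. */
int main(void)
{
    int epfd = epoll_create1(0);
    if (epfd < 0) {
        perror("epoll_create1");
        return EXIT_FAILURE;
    }

    /* For each fd the context cares about (stdin here as a stand-in): */
    struct epoll_event ev = { .events = EPOLLIN, .data.fd = 0 };
    if (epoll_ctl(epfd, EPOLL_CTL_ADD, 0, &ev) < 0) {
        perror("epoll_ctl");
        return EXIT_FAILURE;
    }

    struct epoll_event ready[16];
    int n = epoll_wait(epfd, ready, 16, -1); /* block until activity */
    for (int i = 0; i < n; i++)
        printf("fd %d is readable\n", ready[i].data.fd);
    return 0;
}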

Paolo


Re: [PATCH v3 2/2] KVM: x86: directly use kvm_make_request again

2014-09-19 Thread Xiao Guangrong
On 09/19/2014 08:25 PM, Radim Krčmář wrote:

   * Returns 1 to let __vcpu_run() continue the guest execution loop without
   * exiting to the userspace.  Otherwise, the value will be returned to the
 @@ -6018,8 +6024,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
 kvm_mmu_sync_roots(vcpu);
 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
  -   ++vcpu->stat.tlb_flush;
  -   kvm_x86_ops->tlb_flush(vcpu);
 +   kvm_vcpu_flush_tlb(vcpu);

 NACK!

  Do not understand why you have to introduce a meaningful name
  here - it's used just inside a function, which cannot help to
  improve the readability of the code at all.
 
 I prefer the new hunk
  - it makes the parent function simpler (not everyone wants to read how
we do tlb flushes when looking at vcpu_enter_guest)

Using one line instead of two does not simplify the parent function much.

  - the function is properly named

kvm_x86_ops->tlb_flush(vcpu) is also a good hint to tell the reader it is
doing a tlb flush. :)

  - we do a similar thing with kvm_gen_kvmclock_update

I understand this raw-bit-set style is largely used in current kvm code;
however, that does not mean it's the best way to do it. It may be phased out
someday as it gets used in more and more places.

Anyway, the meaningful name wrapping raw-bit-set is a right direction
and let's keep this right direction.

 
 What i suggested is renaming kvm_mmu_flush_tlb() since it's an
 API used in multiple files - a good name helps developers to
 know what it's doing and is definitely easier to type.
 
 I think it is a good idea.
 The proposed name is definitely better than what we have now.
 
 You can see reasons that led me to prefer raw request below.
 (Preferring something else is no way means that I'm against your idea.)

I understand that, Radim! :)

 
 ---
 I'm always trying to reach some ideal code in my mind, which makes me
 seemingly oppose good proposals because I see how it could be even
 better ...  and I opt not to do them.
 (Pushing minor refactoring patches upstream is hard!)
 
 My issues with kvm_mmu_flush_tlb:
 
  - 'kvm_flush_remote_tlbs()' calls tlb request directly;
 our wrapper thus cannot be extended with features, which makes it a
 poor abstraction

kvm_flush_remote_tlbs does not only set the tlb request but also handles
memory ordering and syncs the tlb state.

I guess you wanted to say kvm_mmu_flush_tlb here; it is an API name, so let
it be easily used in other files. It's not worth committing a patch that does
nothing except revert the meaningful name.

  - we don't do this for other requests

See above.

  - direct request isn't absolutely horrible to read and write
(I totally agree that it is bad.)
  - we call one function 'kvm_mmu_flush_tlb()' and the second one
'kvm_flush_remote_tlbs()' and I'd need to look why

Yeah, this is why i suggested renaming kvm_mmu_flush_tlb, since that
clarifies things better (see the sketch after this list):
- kvm_flush_remote_tlbs: flush tlb in all vcpus
- kvm_vcpu_flush_tlb: only flush tlb on the vcpu specified by @vcpu.
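
In code terms (a kernel-context sketch; both calls exist as shown after this
series):

/* Flush TLBs on every vcpu of the VM; handles memory ordering and
 * kicks remote vcpus so the flush is not deferred. */
kvm_flush_remote_tlbs(vcpu->kvm);

/* Queue a flush for this vcpu only; it is performed by
 * kvm_vcpu_flush_tlb() at the next guest entry. */
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);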

 
 Which is why just removing it solves more problems for me :)

Thank you for raising this question and letting me know the patch's history. :)





Re: [RESEND PATCH 1/3] x86: Adding structs to reflect cpuid fields

2014-09-19 Thread Paolo Bonzini
On 19/09/2014 09:58, Borislav Petkov wrote:
  The trivial example is feature bits like XSAVE. We query them all the
  time without checking the family when they were first introduced,
  don't we?
 The feature bits would obviously be 0 if features are not supported.

And similarly, Intel would not extend a bitfield from 16 to 17 bits if the
extra bit weren't zero on all older processors.

 However, even there
 
 16 - Reserved - Do not count on the value.
 
 I'm quoting Intel's CPUID doc 241618-037 from 2011 (there might be a
 newer one though), the CPUID(1).ECX description.

Once that bit gets a meaning in newer processors, the same meaning will
work retroactively for existing processors.  That's just how CPUID is
used.  Nobody checks families before testing bits, Intel/AMD do not even
suggest that.
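
To make the object of the argument concrete, the series under discussion
models CPUID registers roughly like this - an illustrative fragment, not the
actual patch; intermediate bits are collapsed for brevity, and low bits come
first as in the usual little-endian layout:

#include <stdint.h>

/* Illustrative only: a bitfield view of CPUID.1:ECX. "Growing" a field
 * means absorbing a formerly-reserved adjacent bit, which is safe exactly
 * when that bit read as zero on older parts. */
union cpuid_1_ecx {
	struct {
		uint32_t sse3:1;        /* bit 0 */
		uint32_t pclmulqdq:1;   /* bit 1 */
		uint32_t other_2_25:24; /* collapsed for this example */
		uint32_t xsave:1;       /* bit 26 */
		uint32_t osxsave:1;     /* bit 27 */
		uint32_t other_28_31:4;
	};
	uint32_t full;
};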

 Do you have a guarantee that this won't happen in the future and break
 all your fancy bitfields assumptions?

No guarantee, but were that to happen, I'd expect tar and feathers
spectacles around Intel's engineering headquarters.

Paolo


Re: [PATCH v3 2/2] KVM: x86: directly use kvm_make_request again

2014-09-19 Thread Paolo Bonzini
On 19/09/2014 15:35, Xiao Guangrong wrote:
  Which is why just removing it solves more problems for me :)
 Thank you for raising this question and letting me know the patch's history. 
 :)

I agree with Radim, so I'll apply the patch.

Paolo


Re: [PATCH v3 2/2] KVM: x86: directly use kvm_make_request again

2014-09-19 Thread Liang Chen

On 09/19/2014 02:12 AM, Xiao Guangrong wrote:
 On 09/19/2014 12:38 AM, Liang Chen wrote:
 A one-line wrapper around kvm_make_request does not seem
 particularly useful. Replace kvm_mmu_flush_tlb() with
 kvm_make_request() again to free the namespace a bit.

 Signed-off-by: Liang Chen liangchen.li...@gmail.com
 ---
  arch/x86/include/asm/kvm_host.h |  1 -
  arch/x86/kvm/mmu.c  | 16 +---
  arch/x86/kvm/vmx.c  |  2 +-
  arch/x86/kvm/x86.c  | 11 ---
  4 files changed, 14 insertions(+), 16 deletions(-)

 diff --git a/arch/x86/include/asm/kvm_host.h 
 b/arch/x86/include/asm/kvm_host.h
 index 7c492ed..77ade89 100644
 --- a/arch/x86/include/asm/kvm_host.h
 +++ b/arch/x86/include/asm/kvm_host.h
 @@ -917,7 +917,6 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu);

  int fx_init(struct kvm_vcpu *vcpu);

 -void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
  void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 const u8 *new, int bytes);
  int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
 diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
 index b41fd97..acc2d0c5 100644
 --- a/arch/x86/kvm/mmu.c
 +++ b/arch/x86/kvm/mmu.c
 @@ -1749,7 +1749,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, 
 struct kvm_mmu_page *sp,
  return 1;
  }

 -kvm_mmu_flush_tlb(vcpu);
 +kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
  return 0;
  }

 @@ -1802,7 +1802,7 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu,  
 gfn_t gfn)

  kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
  if (flush)
 -kvm_mmu_flush_tlb(vcpu);
 +kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
  }

  struct mmu_page_path {
 @@ -2536,7 +2536,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 
 *sptep,
true, host_writable)) {
  if (write_fault)
  *emulate = 1;
 -kvm_mmu_flush_tlb(vcpu);
 +kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
  }

  if (unlikely(is_mmio_spte(*sptep) && emulate))
 @@ -3450,12 +3450,6 @@ static void nonpaging_init_context(struct kvm_vcpu 
 *vcpu,
  context->nx = false;
  }

 -void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 -{
 -kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 -}
 -EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb);
 -
  void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu)
  {
  mmu_free_roots(vcpu);
 @@ -3961,7 +3955,7 @@ static void mmu_pte_write_flush_tlb(struct kvm_vcpu 
 *vcpu, bool zap_page,
  if (remote_flush)
  kvm_flush_remote_tlbs(vcpu->kvm);
  else if (local_flush)
 -kvm_mmu_flush_tlb(vcpu);
 +kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
  }

  static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
 @@ -4222,7 +4216,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
  void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
  {
  vcpu->arch.mmu.invlpg(vcpu, gva);
 -kvm_mmu_flush_tlb(vcpu);
 +kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
  ++vcpu->stat.invlpg;
  }
  EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index bfe11cf..bb0a7ab 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -6617,7 +6617,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
  switch (type) {
  case VMX_EPT_EXTENT_GLOBAL:
  kvm_mmu_sync_roots(vcpu);
 -kvm_mmu_flush_tlb(vcpu);
 +kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
  nested_vmx_succeed(vcpu);
  break;
  default:
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index 9eb5458..fc3df50 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -726,7 +726,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
  {
  if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
  kvm_mmu_sync_roots(vcpu);
 -kvm_mmu_flush_tlb(vcpu);
 +kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
  return 0;
  }

 @@ -5989,6 +5989,12 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
  kvm_apic_update_tmr(vcpu, tmr);
  }

 +static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
 +{
  +++vcpu->stat.tlb_flush;
  +kvm_x86_ops->tlb_flush(vcpu);
 +}
 +
  /*
   * Returns 1 to let __vcpu_run() continue the guest execution loop without
   * exiting to the userspace.  Otherwise, the value will be returned to the
 @@ -6018,8 +6024,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
  if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
  kvm_mmu_sync_roots(vcpu);
  if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
  -++vcpu->stat.tlb_flush;
  -kvm_x86_ops->tlb_flush(vcpu);
 +kvm_vcpu_flush_tlb(vcpu);
 NACK!

 Do not understand why you have to introduce a meaningful name
 here - it's used just inside a function, which cannot help to
 improve the readability of the code at all.


Re: [Qemu-devel] [INVITE] OVMF BoF session at the KVM Forum 2014

2014-09-19 Thread Ard Biesheuvel
On 18 September 2014 05:18, Laszlo Ersek ler...@redhat.com wrote:
 On 09/18/14 13:44, Andreas Färber wrote:
 Hello Laszlo,

 Am 18.09.2014 um 10:23 schrieb Laszlo Ersek:
 I've been made an offer that I couldn't refuse :) to organize a Birds
 of a Feather session concerning OVMF at the KVM Forum 2014.

 Interested people, please sign up:

   http://www.linux-kvm.org/page/KVM_Forum_2014_BOF#OVMF

 Nice idea. Your summary mentions only ia32 and x86_64 - I would be
 interested in an update on OVMF for AArch64 - there seemed to already be
 support for ARM's Foundation Model but not yet for QEMU.

 We've successfully UEFI-booted
 - GNU/Linux guest(s) on
 - upstream edk2 (*) and
 - upstream qemu-system-aarch64 with
   - TCG on x86_64 host,
   - KVM on aarch64 host (**)

 (*) Ard's patches for upstream edk2 are in the process of being tested /
 merged.


They have been merged as of yesterday.

 (**) Ard's patches for the upstream host kernel (== KVM) have been...
 ugh, not sure... applied to a maintainer tree? Ard? :)


Some are in kvm/master, which I think means they should go into the next
3.17-rc, although I haven't seen much movement there.
Some other bits are being taken through the kvmarm/queue branch, so
they should arrive by 3.18-rc1

 So, it works (as far as I tested it myself on TCG, and heard reports
 about it on KVM), but right now you need to apply a handful of pending
 patches manually.

 We can certainly talk about Aarch64 at the BoF, but then Ard should
 co-organize. No good deed goes unpunished, as ever! :)


I do appreciate the suggestion, but I don't think it will be feasible
for me to come to Duesseldorf.


[RFC] KVM-S390: Provide guest TOD Clock Get/Set Controls

2014-09-19 Thread Jason J. Herne
From: Jason J. Herne jjhe...@us.ibm.com

Enable KVM_SET_CLOCK and KVM_GET_CLOCK Ioctls on S390 for managing guest TOD
clock value.

We add the KVM_CLOCK_FORWARD_ONLY flag to indicate to KVM_SET_CLOCK that the
given clock value should only be set if it is >= the current guest TOD clock
value. This guarantees a monotonically increasing time.

NOTE: In the event that the KVM_CLOCK_FORWARD_ONLY flag is set and the given
time would cause the guest time to jump backward, then we set the guest TOD
clock equal to the host TOD clock. Does this behavior make sense, or is it too
weird? I could believe that other architectures might not want this exact
behavior. Instead they might prefer to implement the function such that an
error code is returned instead of syncing the guest time to host time? In that
case S390 would need another bit KVM_CLOCK_SET_TO_HOST which we could call to
sync host time when the preferred guest time value would otherwise violate
the monotonic property of the KVM_CLOCK_FORWARD_ONLY flag.
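
A sketch of how userspace would drive the proposed interface, assuming the
flag lands with these semantics (error handling trimmed):

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

/* vm_fd is an open VM file descriptor obtained via KVM_CREATE_VM. */
static int bump_guest_tod(int vm_fd, __u64 new_tod)
{
	struct kvm_clock_data data;

	memset(&data, 0, sizeof(data));
	data.clock = new_tod;
	data.flags = KVM_CLOCK_FORWARD_ONLY; /* proposed flag, not upstream */

	/* If new_tod would move guest time backwards, the kernel falls
	 * back to the host TOD instead of failing. */
	return ioctl(vm_fd, KVM_SET_CLOCK, &data);
}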

Signed-off-by: Jason J. Herne jjhe...@us.ibm.com
---
 Documentation/virtual/kvm/api.txt |  5 +++
 arch/s390/kvm/kvm-s390.c  | 80 +++
 include/uapi/linux/kvm.h  |  3 ++
 3 files changed, 88 insertions(+)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index beae3fd..615c2e4 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -779,6 +779,11 @@ struct kvm_clock_data {
__u32 pad[9];
 };
 
+S390: KVM_CLOCK_FORWARD_ONLY is used by KVM_SET_CLOCK to indicate that the 
guest
+TOD clock should not be allowed to jump back in time. This flag guarantees a
+monotonically increasing guest clock. If the clock value specified would cause
+the guest to jump back in time then the guest TOD clock is set to the host
+TOD clock value.
 
 4.31 KVM_GET_VCPU_EVENTS
 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 81b0e11..2450db3 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -31,6 +31,7 @@
 #include asm/switch_to.h
 #include asm/facility.h
 #include asm/sclp.h
+#include <asm/timex.h>
 #include kvm-s390.h
 #include gaccess.h
 
@@ -169,6 +170,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_IRQCHIP:
case KVM_CAP_VM_ATTRIBUTES:
case KVM_CAP_MP_STATE:
+   case KVM_CAP_ADJUST_CLOCK:
r = 1;
break;
case KVM_CAP_NR_VCPUS:
@@ -338,6 +340,63 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
return ret;
 }
 
+static int kvm_s390_get_guest_tod(struct kvm *kvm, struct kvm_clock_data *user_clock)
+{
+   u64 current_host_tod;
+   u64 epoch = 0;
+   struct kvm_vcpu *vcpu;
+   unsigned int vcpu_idx;
+   int r;
+
+   /* All vcpu's epochs are in sync. Just Grab the 1st one */
+   kvm_for_each_vcpu(vcpu_idx, vcpu, kvm)
+   {
+   epoch = vcpu->arch.sie_block->epoch;
+   break;
+   }
+
+   r = store_tod_clock(&current_host_tod);
+   if (r)
+   return r;
+
+   user_clock->clock = current_host_tod + epoch;
+   return 0;
+}
+
+/*
+Set the guest's effective TOD clock to the given value. The guest's
+TOD clock is determined by the following formula: gtod = host_tod + epoch.
+NOTE: Even though the epoch value is associated with a vcpu, there is only
+one TOD clock and epoch value per guest.  All vcpu's epoch values must be kept
+synchronized.
+NOTE: The KVM_CLOCK_FORWARD_ONLY flag is used to indicate that the guest clock
+should only be set to the provided value if doing so does not cause guest time
+to jump backwards. In this case we zero the epoch thereby making the guest TOD
+clock equal to the host TOD clock.
+*/
+static int kvm_s390_set_guest_tod(struct kvm *kvm, struct kvm_clock_data *user_clock)
+{
+   u64 current_host_tod, epoch;
+   struct kvm_vcpu *vcpu;
+   unsigned int vcpu_idx;
+   int r;
+
+   r = store_tod_clock(&current_host_tod);
+   if (r)
+   return r;
+
+   if ((user_clock->flags & KVM_CLOCK_FORWARD_ONLY) &&
+       (current_host_tod > user_clock->clock))
+   epoch = 0;
+   else
+   epoch = user_clock->clock - current_host_tod;
+
+   kvm_for_each_vcpu(vcpu_idx, vcpu, kvm)
+   vcpu->arch.sie_block->epoch = epoch;
+
+   return 0;
+}
+
 long kvm_arch_vm_ioctl(struct file *filp,
   unsigned int ioctl, unsigned long arg)
 {
@@ -397,6 +456,27 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = kvm_s390_vm_has_attr(kvm, attr);
break;
}
+   case KVM_GET_CLOCK: {
+   struct kvm_clock_data user_clock;
+
+   r = kvm_s390_get_guest_tod(kvm, &user_clock);
+   if (r)
+   break;
+
+   if (copy_to_user(argp, &user_clock, sizeof(user_clock)))
+   r = 

Re: [PATCH v3 2/2] KVM: x86: directly use kvm_make_request again

2014-09-19 Thread Liang Chen
oops, didn't see this ;)   Thank you!

Thanks,
Liang

On 09/19/2014 10:00 AM, Paolo Bonzini wrote:
 Il 19/09/2014 15:35, Xiao Guangrong ha scritto:
 Which is why just removing it solves more problems for me :)
 Thank you for raising this question and letting me know the patch's history. 
 :)
 I agree with Radim, so I'll apply the patch.

 Paolo



Re: [RESEND PATCH 1/3] x86: Adding structs to reflect cpuid fields

2014-09-19 Thread Borislav Petkov
On Fri, Sep 19, 2014 at 03:40:29PM +0200, Paolo Bonzini wrote:
 And similarly, Intel would not extend a bit from 16 to 17 bits if it
 weren't zero on all older processors.

Nothing is stopping a hw designer from saying we're using 17 bits now. And
software needs to deal with that. It is that simple!

  However, even there
  
  16 - Reserved - Do not count on the value.
  
  I'm quoting Intel's CPUID doc 241618-037 from 2011 (there might be a
  newer one though), the CPUID(1).ECX description.
 
 Once that bit gets a meaning in newer processors, the same meaning will
 work retroactively for existing processors.  That's just how CPUID is
 used.  Nobody checks families before testing bits, Intel/AMD do not even
 suggest that.

I know that, and I bet those bits won't even get reused but we don't
know, do we?! All I'm pointing at is that even the feature bits which
are reserved cannot be relied upon to have any particular value.

In any case, the moment hw designers decide to change any field width
for which you have a union defined, this fragile scheme breaks and you
need to introduce ifdeffery.

And frankly, this whole waste-time discussion is only about you guys
wanting to use those bitfields so that constructing a CPUID value works
more conveniently. (Reportedly, I'm not convinced of that at all).

While the gazillion other places in the kernel simply use logical
operations to construct a value and write it into the corresponding
register. Does that mean that they have to go and define unions for
those registers too? Of course not! That would be insane.
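
For comparison, the plain-mask style in question is just this (bit
positions taken from CPUID(1).ECX, purely as an illustration):

	u32 ecx = 0;

	ecx |= (1u << 5);	/* CPUID(1).ECX bit 5: VMX */
	ecx |= (1u << 23);	/* CPUID(1).ECX bit 23: POPCNT */
	/* ...no per-register unions or bitfields needed. */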

I remember a time where we did remove exactly that type of bitfields
from the ide code because it wasn't helping, it was counter-intuitive
and needless.

Geez, can we drop this already! It is a dumb idea, it doesn't bring us
anything besides some superficial readability which you don't really
need. Nowadays you can use even goddam wikipedia to understand what the
CPUID bits mean:

https://en.wikipedia.org/wiki/CPUID

Every time we have to access registers, we need to open the corresponding
manual (or wikipedia, alternatively :-)) and look at the bit
definitions. This will not change.

  Do you have a guarantee that this won't happen in the future and break
  all your fancy bitfields assumptions?
 
 No guarantee, but were that to happen, I'd expect tar and feathers
 spectacles around Intel's engineering headquarters.

You're going over and doing some fireworks there?

:-P

-- 
Regards/Gruss,
Boris.


Re: [Qemu-devel] [RFC patch 5/6] s390: implement pci instruction

2014-09-19 Thread Thomas Huth

 Hi Frank,

On Fri, 19 Sep 2014 13:54:34 +0200
frank.blasc...@de.ibm.com wrote:

 From: Frank Blaschka frank.blasc...@de.ibm.com
 
 This patch implements the s390 pci instructions in qemu. This allows
 attaching qemu pci devices, including vfio. This does not mean the
 devices are functional, but at least detection and config/memory space
 access is working.
 
 Signed-off-by: Frank Blaschka frank.blasc...@de.ibm.com
 ---
  target-s390x/Makefile.objs |2 
  target-s390x/kvm.c |   52 +++
  target-s390x/pci_ic.c  |  621 +
  target-s390x/pci_ic.h  |  425 ++
  4 files changed, 1099 insertions(+), 1 deletion(-)
 
 --- a/target-s390x/Makefile.objs
 +++ b/target-s390x/Makefile.objs
 @@ -2,4 +2,4 @@ obj-y += translate.o helper.o cpu.o inte
  obj-y += int_helper.o fpu_helper.o cc_helper.o mem_helper.o misc_helper.o
  obj-y += gdbstub.o
  obj-$(CONFIG_SOFTMMU) += ioinst.o arch_dump.o
 -obj-$(CONFIG_KVM) += kvm.o
 +obj-$(CONFIG_KVM) += kvm.o pci_ic.o
 --- a/target-s390x/kvm.c
 +++ b/target-s390x/kvm.c
 @@ -40,6 +40,7 @@
  #include "exec/gdbstub.h"
  #include "trace.h"
  #include "qapi-event.h"
 +#include "pci_ic.h"
 
  /* #define DEBUG_KVM */
 
 @@ -56,6 +57,7 @@
  #define IPA0_B2 0xb200
  #define IPA0_B9 0xb900
  #define IPA0_EB 0xeb00
 +#define IPA0_E3 0xe300
 
  #define PRIV_B2_SCLP_CALL   0x20
  #define PRIV_B2_CSCH0x30
 @@ -76,8 +78,17 @@
  #define PRIV_B2_XSCH0x76
 
  #define PRIV_EB_SQBS0x8a
 +#define PRIV_EB_PCISTB  0xd0
 +#define PRIV_EB_SIC 0xd1
 
  #define PRIV_B9_EQBS0x9c
 +#define PRIV_B9_CLP 0xa0
 +#define PRIV_B9_PCISTG  0xd0
 +#define PRIV_B9_PCILG   0xd2
 +#define PRIV_B9_RPCIT   0xd3
 +
 +#define PRIV_E3_MPCIFC  0xd0
 +#define PRIV_E3_STPCIFC 0xd4
 
  #define DIAG_IPL0x308
  #define DIAG_KVM_HYPERCALL  0x500
 @@ -813,6 +824,18 @@ static int handle_b9(S390CPU *cpu, struc
  int r = 0;
 
  switch (ipa1) {
 +    case PRIV_B9_CLP:
 +        r = kvm_clp_service_call(cpu, run);
 +        break;
 +    case PRIV_B9_PCISTG:
 +        r = kvm_pcistg_service_call(cpu, run);
 +        break;
 +    case PRIV_B9_PCILG:
 +        r = kvm_pcilg_service_call(cpu, run);
 +        break;
 +    case PRIV_B9_RPCIT:
 +        r = kvm_rpcit_service_call(cpu, run);
 +        break;
      case PRIV_B9_EQBS:
          /* just inject exception */
          r = -1;
 @@ -831,6 +854,12 @@ static int handle_eb(S390CPU *cpu, struc
  int r = 0;
 
  switch (ipa1) {
 +    case PRIV_EB_PCISTB:
 +        r = kvm_pcistb_service_call(cpu, run);
 +        break;
 +    case PRIV_EB_SIC:
 +        r = kvm_sic_service_call(cpu, run);
 +        break;
      case PRIV_EB_SQBS:
          /* just inject exception */
          r = -1;

I'm not sure, but I think the handler for the eb instructions is wrong:
The second byte of the opcode is encoded in the lowest byte of the ipb
field, not the lowest byte of the ipa field (just like with the e3
handler). Did you verify that your handlers get called correctly?
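
In other words, the dispatch would presumably need to mirror what you do
for e3 below, along the lines of this sketch:

    case IPA0_EB:
        /* second opcode byte is the lowest byte of IPB, not IPA */
        r = handle_eb(cpu, run, run->s390_sieic.ipb & 0xff);
        break;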

 @@ -844,6 +873,26 @@ static int handle_eb(S390CPU *cpu, struc
  return r;
  }
 
 +static int handle_e3(S390CPU *cpu, struct kvm_run *run, uint8_t ipa1)
 +{
 +    int r = 0;
 +
 +    switch (ipa1) {
 +    case PRIV_E3_MPCIFC:
 +        r = kvm_mpcifc_service_call(cpu, run);
 +        break;
 +    case PRIV_E3_STPCIFC:
 +        r = kvm_stpcifc_service_call(cpu, run);
 +        break;
 +    default:
 +        r = -1;
 +        DPRINTF("KVM: unhandled PRIV: 0xe3%x\n", ipa1);
 +        break;
 +    }
 +
 +    return r;
 +}

Could you please replace ipa1 with ipb1 to avoid confusion here?

  static int handle_hypercall(S390CPU *cpu, struct kvm_run *run)
  {
     CPUS390XState *env = cpu->env;
 @@ -1038,6 +1087,9 @@ static int handle_instruction(S390CPU *c
      case IPA0_EB:
          r = handle_eb(cpu, run, ipa1);
          break;
 +    case IPA0_E3:
 +        r = handle_e3(cpu, run, run->s390_sieic.ipb & 0xff);
 +        break;
      case IPA0_DIAG:
          r = handle_diag(cpu, run, run->s390_sieic.ipb);
          break;
 --- /dev/null
 +++ b/target-s390x/pci_ic.c
 @@ -0,0 +1,621 @@
[...]
 +
 +int kvm_pcilg_service_call(S390CPU *cpu, struct kvm_run *run)
 +{
 +    CPUS390XState *env = cpu->env;
 +    S390PCIBusDevice *pbdev;
 +    uint8_t r1 = (run->s390_sieic.ipb & 0x00f00000) >> 20;
 +    uint8_t r2 = (run->s390_sieic.ipb & 0x000f0000) >> 16;
 +    PciLgStg *rp;
 +    uint64_t offset;
 +    uint64_t data;
 +
 +    cpu_synchronize_state(CPU(cpu));
 +    rp = (PciLgStg *)env->regs[r2];

I think you have to check that r2 is even here (and inject a
specification exception otherwise).

You should also 

Re: [PATCH v6] arm64: fix VTTBR_BADDR_MASK

2014-09-19 Thread Catalin Marinas
On Tue, Sep 09, 2014 at 12:08:52AM +0100, Joel Schopp wrote:
 The current VTTBR_BADDR_MASK only masks 39 bits, which is broken on current
 systems.  Rather than just add a bit it seems like a good time to also set
 things at run time instead of compile time to accommodate more hardware.
 
 This patch sets TCR_EL2.PS, VTCR_EL2.T0SZ and vttbr_baddr_mask at run time,
 not compile time.
 
 In ARMv8, the EL2 physical address size (TCR_EL2.PS) and stage2 input address
 size (VTCR_EL2.T0SZ) cannot be determined at compile time since they
 depend on hardware capability.
 
 According to Table D4-23 and Table D4-25 in ARM DDI 0487A.b document,
 vttbr_x is calculated using different fixed values with consideration
 of T0SZ, granule size and the level of translation tables. Therefore,
 vttbr_baddr_mask should be determined dynamically.

So I agree with vttbr_baddr_mask being determined dynamically. I also
agree with setting TCR_EL2.PS at run-time but the VTCR_EL2.T0SZ
determines the input of the stage 2 translation. That's a KVM
configuration about what IPA size it provides to the guests (and
platform model it intends to support) and it doesn't need to be the same
as the physical address range.
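
To illustrate the split (a sketch only; tcr_el2/vtcr_el2 stand in for the
values __do_hyp_init would eventually write):

	u64 parange = read_cpuid(ID_AA64MMFR0_EL1) & 0x7;  /* hw PA range */

	tcr_el2  |= parange << 16;                 /* TCR_EL2.PS from hardware */
	vtcr_el2 |= VTCR_EL2_T0SZ(KVM_PHYS_SHIFT); /* IPA size stays KVM's choice */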

 diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
 index 5cc0b0f..03a08bb 100644
 --- a/arch/arm/include/asm/kvm_mmu.h
 +++ b/arch/arm/include/asm/kvm_mmu.h
 @@ -21,6 +21,7 @@
 
  #include asm/memory.h
  #include asm/page.h
 +#include asm/kvm_arm.h
 
  /*
   * We directly use the kernel VA for the HYP, as we can directly share
 @@ -178,6 +179,18 @@ static inline void coherent_cache_guest_page(struct 
 kvm_vcpu *vcpu, hva_t hva,
 
  void stage2_flush_vm(struct kvm *kvm);
 
 +static inline int kvm_get_phys_addr_shift(void)
 +{
 +   return KVM_PHYS_SHIFT;
 +}
 +
 +
 +static inline u32 get_vttbr_baddr_mask(void)
 +{
 +   return VTTBR_BADDR_MASK;
 +}

It should be u64 as it truncates the top 32 bits of the mask.

 diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
 index a99e0cd..d0fca8f 100644
 --- a/arch/arm/kvm/arm.c
 +++ b/arch/arm/kvm/arm.c
 @@ -37,6 +37,7 @@
  #include asm/mman.h
  #include asm/tlbflush.h
  #include asm/cacheflush.h
 +#include asm/cputype.h
  #include asm/virt.h
  #include asm/kvm_arm.h
  #include asm/kvm_asm.h
 @@ -61,6 +62,12 @@ static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
  static u8 kvm_next_vmid;
  static DEFINE_SPINLOCK(kvm_vmid_lock);
 
 +#ifdef CONFIG_ARM64
 +static u64 vttbr_baddr_mask;
 +#else
 +static u32 vttbr_baddr_mask;
 +#endif

This mask should always be 64-bit as it relates to the physical address
which is 64-bit even on arm32 with LPAE enabled (same with the
get_vttbr_baddr_mask() return type above).

 diff --git a/arch/arm64/include/asm/kvm_arm.h 
 b/arch/arm64/include/asm/kvm_arm.h
 index cc83520..ff4a4fa 100644
 --- a/arch/arm64/include/asm/kvm_arm.h
 +++ b/arch/arm64/include/asm/kvm_arm.h
 @@ -95,7 +95,6 @@
  /* TCR_EL2 Registers bits */
   #define TCR_EL2_TBI    (1 << 20)
   #define TCR_EL2_PS     (7 << 16)
  -#define TCR_EL2_PS_40B (2 << 16)
   #define TCR_EL2_TG0    (1 << 14)
   #define TCR_EL2_SH0    (3 << 12)
   #define TCR_EL2_ORGN0  (3 << 10)
 @@ -104,8 +103,6 @@
  #define TCR_EL2_MASK   (TCR_EL2_TG0 | TCR_EL2_SH0 | \
  TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ)
 
 -#define TCR_EL2_FLAGS  (TCR_EL2_PS_40B)
 -
  /* VTCR_EL2 Registers bits */
   #define VTCR_EL2_PS_MASK   (7 << 16)
   #define VTCR_EL2_TG0_MASK  (1 << 14)
  @@ -120,36 +117,28 @@
   #define VTCR_EL2_SL0_MASK  (3 << 6)
   #define VTCR_EL2_SL0_LVL1  (1 << 6)
   #define VTCR_EL2_T0SZ_MASK 0x3f
  -#define VTCR_EL2_T0SZ_40B  24
  +#define VTCR_EL2_T0SZ(bits)    (64 - (bits))
 
  #ifdef CONFIG_ARM64_64K_PAGES
  /*
   * Stage2 translation configuration:
 - * 40bits output (PS = 2)
 - * 40bits input  (T0SZ = 24)
   * 64kB pages (TG0 = 1)
   * 2 level page tables (SL = 1)
   */
  #define VTCR_EL2_FLAGS (VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER | \
  VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
 -VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
 -#define VTTBR_X        (38 - VTCR_EL2_T0SZ_40B)
 +VTCR_EL2_SL0_LVL1)
  #else
  /*
   * Stage2 translation configuration:
 - * 40bits output (PS = 2)
 - * 40bits input  (T0SZ = 24)
   * 4kB pages (TG0 = 0)
   * 3 level page tables (SL = 1)
   */
  #define VTCR_EL2_FLAGS (VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | \
  VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
 -VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
 -#define VTTBR_X        (37 - VTCR_EL2_T0SZ_40B)
 +VTCR_EL2_SL0_LVL1)
  #endif

The PS = 2 comment was misleading as it doesn't seem to be set here. But
why drop T0SZ? That gives the input address for stage 2 (IPA), so
whether KVM exposes a 40-bit IPA to guests is independent of whether the
kernel would use 48-bit VA.

 

Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread Nakajima, Jun
On Thu, Sep 18, 2014 at 6:28 PM, Andy Lutomirski l...@amacapital.net wrote:
 On Thu, Sep 18, 2014 at 6:03 PM, Andy Lutomirski l...@amacapital.net wrote:
 On Thu, Sep 18, 2014 at 5:49 PM, Nakajima, Jun jun.nakaj...@intel.com 
 wrote:
 On Thu, Sep 18, 2014 at 3:07 PM, Andy Lutomirski l...@amacapital.net 
 wrote:

 So, as a concrete straw-man:

 CPUID leaf 0x48000000 would return a maximum leaf number in EAX (e.g.
 0x48000001) along with a signature value (e.g. CrossHVPara\0) in
 EBX, ECX, and EDX.

 CPUID 0x48000001.EAX would contain an MSR number to read to get a
 random number if supported and zero if not supported.

 Questions:

 1. Can we use a fixed MSR number?  This would be a little bit simpler,
 but it would depend on getting a wider MSR range from Intel.


 Why do you need a wider MSR range if you always detect the feature by
 CPUID.0x4801?
 Or are you still trying to avoid the detection by CPUID?

 Detecting the feature is one thing, but figuring out the MSR index is
 another.  We could shove the index into the cpuid leaf, but that seems
 unnecessarily indirect.  I'd much rather just say that CPUID leaves
 *and* MSR indexes 0x48000000-0x4800ffff or so are reserved for the
 cross-HV mechanism, but we can't do that without either knowingly
 violating the SDM assignments or asking Intel to consider allocating
 more MSR indexes.

 Also, KVM is already conflicting with the SDM right now in its MSR
 choice :(  I *think* that KVM could be changed to fix that, but 256
 MSRs is rather confining given that KVM currently implements its own
 MSR index *and* part of the Hyper-V index.

 Correction and update:

 KVM currently implements its own MSRs and, optionally, some of the
 Hyper-V MSRs.  By my count, Linux knows about 68 Hyper-V MSRs (in a
 header file), and there are currently 7 KVM MSRs, so over 1/4 of the
 available MSR indices are taken (and even more would be taken if KVM
 were to move its MSRs into the correct range).


I slept on it, and I think using the CPUID instruction alone would be
simple and efficient:
- We have a huge space for CPUID leaves
- CPUID also works for user-level
- It can take an additional 32-bit parameter (ECX), and returns 4
32-bit values (EAX, EBX, ECX, and EDX).  RDMSR, for example, returns a
64-bit value.

Basically we can use it to implement a hypercall (rather than VMCALL).

For example,
- CPUID 0x48000001.EAX would return the feature presence (e.g. in
EBX), and the result in EDX:EAX (if present) at the same time, or
- CPUID 0x48000001.EAX would return the feature presence only, and
CPUID 0x48000002.EAX (acts like a hypercall) returns up to 4 32-bit
values.
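
As a guest-side sketch (using Linux's cpuid() helper; the leaf numbers
are still just the example values above, nothing allocated):

	unsigned int eax, ebx, ecx, edx;

	cpuid(0x48000001, &eax, &ebx, &ecx, &edx);
	if (eax & 1) {		/* feature present? */
		unsigned int r[4];
		cpuid(0x48000002, &r[0], &r[1], &r[2], &r[3]);	/* 128 bits */
	}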

-- 
Jun
Intel Open Source Technology Center


Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread Paolo Bonzini
Il 19/09/2014 18:14, Nakajima, Jun ha scritto:
 For example,
 - CPUID 0x48000001.EAX would return the feature presence (e.g. in
 EBX), and the result in EDX:EAX (if present) at the same time, or
 - CPUID 0x48000001.EAX would return the feature presence only, and
 CPUID 0x48000002.EAX (acts like a hypercall) returns up to 4 32-bit
 values.

The latter is much better, because an unknown CPUID will return the
value of the highest leaf below 0x80000000, and conflicts can happen easily.

Paolo


Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread H. Peter Anvin
On 09/19/2014 09:37 AM, Gleb Natapov wrote:

 Linux detects what hypervisor it runs on very early

Not anywhere close to early enough.  We're talking for uses like kASLR.

-hpa



Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread H. Peter Anvin
On 09/19/2014 09:14 AM, Nakajima, Jun wrote:
 
 I slept on it, and I think using the CPUID instruction alone would be
 simple and efficient:
 - We have a huge space for CPUID leaves
 - CPUID also works for user-level
 - It can take an additional 32-bit parameter (ECX), and returns 4
 32-bit values (EAX, EBX, ECX, and EDX).  RDMSR, for example, returns a
 64-bit value.
 
 Basically we can use it to implement a hypercall (rather than VMCALL).
 
 For example,
  - CPUID 0x48000001.EAX would return the feature presence (e.g. in
  EBX), and the result in EDX:EAX (if present) at the same time, or
  - CPUID 0x48000001.EAX would return the feature presence only, and
  CPUID 0x48000002.EAX (acts like a hypercall) returns up to 4 32-bit
  values.
 

There is a huge disadvantage to the fact that CPUID is a user space
instruction, though.

-hpa



Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread Gleb Natapov
On Thu, Sep 18, 2014 at 03:00:05PM -0700, Andy Lutomirski wrote:
 On Thu, Sep 18, 2014 at 2:46 PM, David Hepkin david...@microsoft.com wrote:
  I suggest we come to consensus on a specific CPUID leaf where an OS needs 
  to look to determine if a hypervisor supports this capability.  We could 
  define a new CPUID leaf range at a well-defined location, or we could just 
  use one of the existing CPUID leaf ranges implemented by an existing 
  hypervisor.  I'm not familiar with the KVM CPUID leaf range, but in the 
  case of Hyper-V, the Hyper-V CPUID leaf range was architected to allow for 
  other hypervisors to implement it and just show through specific 
  capabilities supported by the hypervisor.  So, we could define a bit in the 
  Hyper-V CPUID leaf range (since Xen and KVM also implement this range), but 
  that would require Linux to look in that range on boot to discover this 
  capability.
 
 I also don't know whether QEMU and KVM would be okay with implementing
 the host side of the Hyper-V mechanism by default.  They would have to
 implement at least leaves 0x40000001 and 0x40000002, plus correctly
 reporting zeros through whatever leaf is used for this new feature.
 Gleb?  Paolo?
 
KVM and any other hypervisor out there already implement a capability
detection mechanism in the 0x40000000 range, and of course all of them do
it differently. Linux detects what hypervisor it runs on very early and
switches to the corresponding code to handle each hypervisor. Quite frankly
I do not see what problem you are trying to fix with standardizing an MSR
to get an RNG seed and a detection mechanism for this MSR. The RNG MSR is
in no way unique here. There are other mechanisms that are virtually
identical between hypervisors but have different guest/hypervisor
interfaces and are detected differently on different hypervisors. Examples
are pvclock and pveoi; there may be others.

--
Gleb.


Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread Gleb Natapov
On Fri, Sep 19, 2014 at 09:40:07AM -0700, H. Peter Anvin wrote:
 On 09/19/2014 09:37 AM, Gleb Natapov wrote:
 
  Linux detects what hypervisor it runs on very early
 
 Not anywhere close to early enough.  We're talking for uses like kASLR.
 
Still too early to do:

   h = cpuid(HYPERVISOR_SIGNATURE)
   if (h == KVMKVMKVM) {
  if (cpuid(kvm_features) & kvm_rnd)
 rdmsr(kvm_rnd)
   else (h == HyperV) {
  if (cpuid(hv_features) & hv_rnd)
rdmsr(hv_rnd)
   else (h == XenXenXen) {
  if (cpuid(xen_features) & xen_rnd)
rdmsr(xen_rnd)
  }

?

--
Gleb.


Re: [edk2] [Qemu-devel] [INVITE] OVMF BoF session at the KVM Forum 2014

2014-09-19 Thread Paolo Bonzini
Il 19/09/2014 16:17, Ard Biesheuvel ha scritto:
 
  (**) Ard's patches for the upstream host kernel (== KVM) have been...
  ugh, not sure... applied to a maintainer tree? Ard? :)
 
 Some are in kvm/master, which I think means they should go into the next
 3.17-rc, although I haven't seen much movement there.

They'll miss this week's rc, but I'll send them out early next week.

Paolo


Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread H. Peter Anvin
On 09/19/2014 09:53 AM, Gleb Natapov wrote:
 On Fri, Sep 19, 2014 at 09:40:07AM -0700, H. Peter Anvin wrote:
 On 09/19/2014 09:37 AM, Gleb Natapov wrote:

  Linux detects what hypervisor it runs on very early

 Not anywhere close to early enough.  We're talking for uses like kASLR.

 Still too early to do:
 
h = cpuid(HYPERVISOR_SIGNATURE)
if (h == KVMKVMKVM) {
   if (cpuid(kvm_features) & kvm_rnd)
  rdmsr(kvm_rnd)
else (h == HyperV) {
   if (cpuid(hv_features) & hv_rnd)
 rdmsr(hv_rnd)
else (h == XenXenXen) {
   if (cpuid(xen_features) & xen_rnd)
 rdmsr(xen_rnd)
   }
 

If we need to do chase loops, especially not so...

-hpa




Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread Gleb Natapov
On Fri, Sep 19, 2014 at 10:08:20AM -0700, H. Peter Anvin wrote:
 On 09/19/2014 09:53 AM, Gleb Natapov wrote:
  On Fri, Sep 19, 2014 at 09:40:07AM -0700, H. Peter Anvin wrote:
  On 09/19/2014 09:37 AM, Gleb Natapov wrote:
 
  Linux detects what hypervisor it runs on very early

  Not anywhere close to early enough.  We're talking for uses like kASLR.

  Still too early to do:
 
 h = cpuid(HYPERVISOR_SIGNATURE)
 if (h == KVMKVMKVM) {
    if (cpuid(kvm_features) & kvm_rnd)
   rdmsr(kvm_rnd)
 else (h == HyperV) {
    if (cpuid(hv_features) & hv_rnd)
  rdmsr(hv_rnd)
 else (h == XenXenXen) {
    if (cpuid(xen_features) & xen_rnd)
  rdmsr(xen_rnd)
}
  
 
 If we need to do chase loops, especially not so...
 
What loops exactly? As a non-native English speaker I fail to understand
if your answer is yes or no ;)

--
Gleb.


Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread H. Peter Anvin
On 09/19/2014 10:15 AM, Gleb Natapov wrote:
 On Fri, Sep 19, 2014 at 10:08:20AM -0700, H. Peter Anvin wrote:
 On 09/19/2014 09:53 AM, Gleb Natapov wrote:
 On Fri, Sep 19, 2014 at 09:40:07AM -0700, H. Peter Anvin wrote:
 On 09/19/2014 09:37 AM, Gleb Natapov wrote:

 Linux detects what hypervisor it runs on very early

 Not anywhere close to early enough.  We're talking for uses like kASLR.

 Still too early to do:

h = cpuid(HYPERVISOR_SIGNATURE)
if (h == KVMKVMKVM) {
   if (cpuid(kvm_features) & kvm_rnd)
  rdmsr(kvm_rnd)
else (h == HyperV) {
   if (cpuid(hv_features) & hv_rnd)
 rdmsr(hv_rnd)
else (h == XenXenXen) {
   if (cpuid(xen_features) & xen_rnd)
 rdmsr(xen_rnd)
   }


 If we need to do chase loops, especially not so...

 What loops exactly? As a non-native English speaker I fail to understand
 if your answer is yes or no ;)
 

The above isn't actually the full algorithm used.

-hpa




Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread Andy Lutomirski
On Sep 19, 2014 9:40 AM, H. Peter Anvin h...@zytor.com wrote:

 On 09/19/2014 09:14 AM, Nakajima, Jun wrote:
 
  I slept on it, and I think using the CPUID instruction alone would be
  simple and efficient:
  - We have a huge space for CPUID leaves
  - CPUID also works for user-level
  - It can take an additional 32-bit parameter (ECX), and returns 4
  32-bit values (EAX, EBX, ECX, and EDX).  RDMSR, for example, returns a
  64-bit value.
 
  Basically we can use it to implement a hypercall (rather than VMCALL).
 
  For example,
  - CPUID 0x48000001.EAX would return the feature presence (e.g. in
  EBX), and the result in EDX:EAX (if present) at the same time, or
  - CPUID 0x48000001.EAX would return the feature presence only, and
  CPUID 0x48000002.EAX (acts like a hypercall) returns up to 4 32-bit
  values.
 

 There is a huge disadvantage to the fact that CPUID is a user space
 instruction, though.

We can always make cpuid on the leaf in question return all zeros if CPL > 0.
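
Hypervisor-side that is a one-liner in the CPUID intercept, e.g. in KVM
(sketch; CROSS_HV_RNG_LEAF is a made-up name):

	if (function == CROSS_HV_RNG_LEAF && kvm_x86_ops->get_cpl(vcpu) > 0)
		*eax = *ebx = *ecx = *edx = 0;	/* hide the seed from CPL > 0 */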


 -hpa



Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread Andy Lutomirski
On Sep 19, 2014 9:53 AM, Gleb Natapov g...@kernel.org wrote:

 On Fri, Sep 19, 2014 at 09:40:07AM -0700, H. Peter Anvin wrote:
  On 09/19/2014 09:37 AM, Gleb Natapov wrote:
  
   Linux detects what hypervisor it runs on very early
 
  Not anywhere close to early enough.  We're talking for uses like kASLR.
 
 Still too early to do:

h = cpuid(HYPERVISOR_SIGNATURE)
if (h == KVMKVMKVM) {
   if (cpuid(kvm_features) & kvm_rnd)
  rdmsr(kvm_rnd)
else (h == HyperV) {
   if (cpuid(hv_features) & hv_rnd)
 rdmsr(hv_rnd)
else (h == XenXenXen) {
   if (cpuid(xen_features) & xen_rnd)
 rdmsr(xen_rnd)
   }


I think that there's a lot of value in having each guest
implementation be automatically compatible with all hypervisors.  For
example, you forgot VMware, and VMware might be less excited about
implementing this feature if all the guests won't immediately start
using it.

 ?

 --
 Gleb.


Re: [PATCH v6] arm64: fix VTTBR_BADDR_MASK

2014-09-19 Thread Catalin Marinas
On Fri, Sep 19, 2014 at 04:28:54PM +0100, Catalin Marinas wrote:
 On Tue, Sep 09, 2014 at 12:08:52AM +0100, Joel Schopp wrote:
  The current VTTBR_BADDR_MASK only masks 39 bits, which is broken on current
  systems.  Rather than just add a bit it seems like a good time to also set
  things at run time instead of compile time to accommodate more hardware.
  
  This patch sets TCR_EL2.PS, VTCR_EL2.T0SZ and vttbr_baddr_mask at run time,
  not compile time.
  
  In ARMv8, the EL2 physical address size (TCR_EL2.PS) and stage2 input address
  size (VTCR_EL2.T0SZ) cannot be determined at compile time since they
  depend on hardware capability.
  
  According to Table D4-23 and Table D4-25 in ARM DDI 0487A.b document,
  vttbr_x is calculated using different fixed values with consideration
  of T0SZ, granule size and the level of translation tables. Therefore,
  vttbr_baddr_mask should be determined dynamically.
 
 So I agree with vttbr_baddr_mask being determined dynamically. I also
 agree with setting TCR_EL2.PS at run-time but the VTCR_EL2.T0SZ
 determines the input of the stage 2 translation. That's a KVM
 configuration about what IPA size it provides to the guests (and
 platform model it intends to support) and it doesn't need to be the same
 as the physical address range.
[...]
  -#define VTTBR_BADDR_SHIFT (VTTBR_X - 1)
  -#define VTTBR_BADDR_MASK  (((1LLU << (40 - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT)

Actually, after some more thinking, why don't we just make the upper
limit of this mask 48-bit always or even 64-bit. That's a physical mask
for checking whether the pgd pointer in vttbr is aligned as per the
architecture requirements. Given that the pointer is allocated from the
platform memory, it's clear that it is within the PA range. So basically
you just need a mask to check the bottom alignment based on
VTCR_EL2.T0SZ (which should be independent from the PA range). I guess
it should be enough as:

#define VTTBR_BADDR_MASK  (~0ULL << VTTBR_BADDR_SHIFT)

without any other changes to T0SZ.
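
The caller would then only keep the alignment check, something like this
sketch (assuming pgd_phys holds the stage-2 pgd's physical address):

	if (pgd_phys & ~VTTBR_BADDR_MASK)
		return -EINVAL;		/* pgd not sufficiently aligned */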

The TCR_EL2.PS setting should be done based on ID_AA64MMFR0_EL1,
but you can do this in __do_hyp_init (it looks like this function
handles VTCR_EL2.PS already; not sure why it doesn't do it for TCR_EL2 as
well).

So IMO you only need about a few lines patch.

-- 
Catalin


Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread H. Peter Anvin
On 09/19/2014 10:21 AM, Andy Lutomirski wrote:

 There is a huge disadvantage to the fact that CPUID is a user space
 instruction, though.
 
 We can always make cpuid on the leaf in question return all zeros if CPL > 0.
 

Not sure that is better...

-hpa




Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread Andy Lutomirski
On Fri, Sep 19, 2014 at 10:36 AM, H. Peter Anvin h...@zytor.com wrote:
 On 09/19/2014 10:21 AM, Andy Lutomirski wrote:

 There is a huge disadvantage to the fact that CPUID is a user space
 instruction, though.

 We can always make cpuid on the leaf in question return all zeros if CPL > 0.


 Not sure that is better...

It's better than #GP...

This is why I prefer rdmsr: the privilege semantics are already
appropriate.  Also, I wouldn't be surprised if shoehorning
non-constant results into cpuid implementations might be awkward for
some hypervisors.
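
E.g., guest-side it reduces to the usual safe-read pattern (the MSR index
below is a placeholder, not an allocated one):

	u64 seed;

	if (!rdmsrl_safe(0x48000001, &seed))	/* returns 0 on success */
		add_device_randomness(&seed, sizeof(seed));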

--Andy


Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread Gleb Natapov
On Fri, Sep 19, 2014 at 10:18:37AM -0700, H. Peter Anvin wrote:
 On 09/19/2014 10:15 AM, Gleb Natapov wrote:
  On Fri, Sep 19, 2014 at 10:08:20AM -0700, H. Peter Anvin wrote:
  On 09/19/2014 09:53 AM, Gleb Natapov wrote:
  On Fri, Sep 19, 2014 at 09:40:07AM -0700, H. Peter Anvin wrote:
  On 09/19/2014 09:37 AM, Gleb Natapov wrote:
 
  Linux detects what hypervisor it runs on very early
 
  Not anywhere close to early enough.  We're talking for uses like kASLR.
 
  Still too early to do:
 
 h = cpuid(HYPERVISOR_SIGNATURE)
 if (h == KVMKVMKVM) {
    if (cpuid(kvm_features) & kvm_rnd)
   rdmsr(kvm_rnd)
 else (h == HyperV) {
    if (cpuid(hv_features) & hv_rnd)
  rdmsr(hv_rnd)
 else (h == XenXenXen) {
    if (cpuid(xen_features) & xen_rnd)
  rdmsr(xen_rnd)
}
 
 
  If we need to do chase loops, especially not so...
 
  What loops exactly? As a non-native English speaker I fail to understand
  if your answer is yes or no ;)
  
 
 The above isn't actually the full algorithm used.
 
What part of the actual algorithm cannot be implemented? The loop that
searches for the KVM leaf in case KVM pretends to be HyperV (is this what
you called chase loops?)? First of all there is no need to implement it:
if KVM pretends to be HyperV, use HyperV's way to obtain the RNG. But what
is the problem with the loop?

--
Gleb.


Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread Gleb Natapov
On Fri, Sep 19, 2014 at 10:21:27AM -0700, Andy Lutomirski wrote:
 On Sep 19, 2014 9:53 AM, Gleb Natapov g...@kernel.org wrote:
 
  On Fri, Sep 19, 2014 at 09:40:07AM -0700, H. Peter Anvin wrote:
   On 09/19/2014 09:37 AM, Gleb Natapov wrote:
   
    Linux detects what hypervisor it runs on very early
   
    Not anywhere close to early enough.  We're talking for uses like kASLR.
  
  Still too early to do:
 
  h = cpuid(HYPERVISOR_SIGNATURE)
  if (h == KVMKVMKVM) {
 if (cpuid(kvm_features) & kvm_rnd)
    rdmsr(kvm_rnd)
  else (h == HyperV) {
 if (cpuid(hv_features) & hv_rnd)
   rdmsr(hv_rnd)
  else (h == XenXenXen) {
 if (cpuid(xen_features) & xen_rnd)
   rdmsr(xen_rnd)
 }
 
 
 I think that there's a lot of value in having each guest
 implementation be automatically compatible with all hypervisors.  For
 example, you forgot VMware, and VMware might be less excited about
 implementing this feature if all the guests won't immediately start
 using it.
 
I forgot VMware because I do not see VMware people CCed. They may
be even less excited about being told _how_ this feature needs to be
implemented (e.g. implement HyperV leafs for the feature detection). I
do not want to and cannot speak for VMware, but my guess is that for
them it would be much easier to add an else clause for VMware in the above
'if' than to coordinate with all hypervisor developers about MSR/cpuid
details. And since this is a security feature, implementing it for Linux
is in their best interest.

--
Gleb.


Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread Andy Lutomirski
On Fri, Sep 19, 2014 at 10:49 AM, Gleb Natapov g...@kernel.org wrote:
 On Fri, Sep 19, 2014 at 10:18:37AM -0700, H. Peter Anvin wrote:
 On 09/19/2014 10:15 AM, Gleb Natapov wrote:
  On Fri, Sep 19, 2014 at 10:08:20AM -0700, H. Peter Anvin wrote:
  On 09/19/2014 09:53 AM, Gleb Natapov wrote:
  On Fri, Sep 19, 2014 at 09:40:07AM -0700, H. Peter Anvin wrote:
  On 09/19/2014 09:37 AM, Gleb Natapov wrote:
 
  Linux detects what hypervisor it runs on very early
 
  Not anywhere close to early enough.  We're talking for uses like kASLR.
 
  Still too early to do:
 
 h = cpuid(HYPERVISOR_SIGNATURE)
 if (h == KVMKVMKVM) {
    if (cpuid(kvm_features) & kvm_rnd)
   rdmsr(kvm_rnd)
 else (h == HyperV) {
    if (cpuid(hv_features) & hv_rnd)
  rdmsr(hv_rnd)
 else (h == XenXenXen) {
    if (cpuid(xen_features) & xen_rnd)
  rdmsr(xen_rnd)
}
 
 
  If we need to do chase loops, especially not so...
 
  What loops exactly? As a non-native English speaker I fail to understand
  if your answer is yes or no ;)
 

 The above isn't actually the full algorithm used.

 What part of the actual algorithm cannot be implemented? The loop that
 searches for the KVM leaf in case KVM pretends to be HyperV (is this what
 you called chase loops?)? First of all there is no need to implement it:
 if KVM pretends to be HyperV, use HyperV's way to obtain the RNG. But what
 is the problem with the loop?


It can be implemented, and I've done it.  But it's a mess.  Almost the
very first thing we do in boot (even before decompressing the kernel)
will be to scan a bunch of cpuid leaves looking for a hypervisor with
an rng source that we can use for kASLR.  And we'll have to update
that code and make it bigger every time another hypervisor adds
exactly the same feature.

And then we have another copy of almost exactly the same code in the
normal post-boot part of the kernel.

We can certainly do this, but I'd much rather solve the problem once
and let all of the hypervisors and guests opt in and immediately be
compatible with each other.

  I forgot VMware because I do not see VMware people CCed. They may
  be even less excited about being told _how_ this feature needs to be
  implemented (e.g. implement HyperV leafs for the feature detection). I
  do not want to and cannot speak for VMware, but my guess is that for
  them it would be much easier to add an else clause for VMware in the above
  'if' than to coordinate with all hypervisor developers about MSR/cpuid
  details. And since this is a security feature, implementing it for Linux
  is in their best interest.

Do you know any of them who should be cc'd?

--Andy


Re: [edk2] [Qemu-devel] [INVITE] OVMF BoF session at the KVM Forum 2014

2014-09-19 Thread Ard Biesheuvel
On 19 September 2014 10:03, Paolo Bonzini pbonz...@redhat.com wrote:
 Il 19/09/2014 16:17, Ard Biesheuvel ha scritto:

  (**) Ard's patches for the upstream host kernel (== KVM) have been...
  ugh, not sure... applied to a maintainer tree? Ard? :)
 
  Some are in kvm/master, which I think means they should go into the next
  3.17-rc, although I haven't seen much movement there.

 They'll miss this week's rc, but I'll send them out early next week.


OK thanks for the update




Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread Gleb Natapov
On Fri, Sep 19, 2014 at 11:02:38AM -0700, Andy Lutomirski wrote:
 On Fri, Sep 19, 2014 at 10:49 AM, Gleb Natapov g...@kernel.org wrote:
  On Fri, Sep 19, 2014 at 10:18:37AM -0700, H. Peter Anvin wrote:
  On 09/19/2014 10:15 AM, Gleb Natapov wrote:
   On Fri, Sep 19, 2014 at 10:08:20AM -0700, H. Peter Anvin wrote:
   On 09/19/2014 09:53 AM, Gleb Natapov wrote:
   On Fri, Sep 19, 2014 at 09:40:07AM -0700, H. Peter Anvin wrote:
   On 09/19/2014 09:37 AM, Gleb Natapov wrote:
  
    Linux detects what hypervisor it runs on very early
   
    Not anywhere close to early enough.  We're talking for uses like kASLR.
   
    Still too early to do:
   
   h = cpuid(HYPERVISOR_SIGNATURE)
   if (h == KVMKVMKVM) {
  if (cpuid(kvm_features) & kvm_rnd)
 rdmsr(kvm_rnd)
   else (h == HyperV) {
  if (cpuid(hv_features) & hv_rnd)
    rdmsr(hv_rnd)
   else (h == XenXenXen) {
  if (cpuid(xen_features) & xen_rnd)
    rdmsr(xen_rnd)
  }
  
  
   If we need to do chase loops, especially not so...
  
   What loops exactly? As a non-native English speaker I fail to understand
   if your answer is yes or no ;)
  
 
  The above isn't actually the full algorithm used.
 
  What part of the actual algorithm cannot be implemented? The loop that
  searches for the KVM leaf in case KVM pretends to be HyperV (is this what
  you called chase loops?)? First of all there is no need to implement it:
  if KVM pretends to be HyperV, use HyperV's way to obtain the RNG. But what
  is the problem with the loop?
 
 
 It can be implemented, and I've done it.  But it's a mess.  Almost the
 very first thing we do in boot (even before decompressing the kernel)
 will be to scan a bunch of cpuid leaves looking for a hypervisor with
 an rng source that we can use for kASLR.  And we'll have to update
 that code and make it bigger every time another hypervisor adds
 exactly the same feature.
IMO implementing this feature is in each hypervisor's best interest, so the
task of updating the code will scale, with each hypervisor's developers
adding it for the hypervisor they care about.

 
 And then we have another copy of almost exactly the same code in the
 normal post-boot part of the kernel.
 
 We can certainly do this, but I'd much rather solve the problem once
 and let all of the hypervisors and guests opt in and immediately be
 compatible with each other.
 
  I forgot VMware because I do not see VMware people to be CCed. They may
  be even less excited about them being told _how_ this feature need to be
  implemented (e.g implement HyperV leafs for the feature detection). I
  do not want to and cannot speak for VMware, but my guess is that for
  them it would be much easier to add an else clause for VMware in above
  if then to coordinate with all hypervisor developers about MSR/cpuid
  details. And since this is security feature implementing it for Linux
  is in their best interest.
 
 Do you know any of them who should be cc'd?
 
No, not anyone in particular. git log arch/x86/kernel/cpu/vmware.c may help.

But VMware is an elephant in the room here, and there are other hypervisors
out there too: VirtualBox, bhyve...

--
Gleb.


Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread Andy Lutomirski
[cc: Alok Kataria at VMware]

On Fri, Sep 19, 2014 at 11:12 AM, Gleb Natapov g...@kernel.org wrote:
 On Fri, Sep 19, 2014 at 11:02:38AM -0700, Andy Lutomirski wrote:
 On Fri, Sep 19, 2014 at 10:49 AM, Gleb Natapov g...@kernel.org wrote:
  On Fri, Sep 19, 2014 at 10:18:37AM -0700, H. Peter Anvin wrote:
  On 09/19/2014 10:15 AM, Gleb Natapov wrote:
   On Fri, Sep 19, 2014 at 10:08:20AM -0700, H. Peter Anvin wrote:
   On 09/19/2014 09:53 AM, Gleb Natapov wrote:
   On Fri, Sep 19, 2014 at 09:40:07AM -0700, H. Peter Anvin wrote:
   On 09/19/2014 09:37 AM, Gleb Natapov wrote:
  
    Linux detects what hypervisor it runs on very early
   
    Not anywhere close to early enough.  We're talking for uses like kASLR.
   
    Still too early to do:
   
   h = cpuid(HYPERVISOR_SIGNATURE)
   if (h == KVMKVMKVM) {
  if (cpuid(kvm_features) & kvm_rnd)
 rdmsr(kvm_rnd)
   else (h == HyperV) {
  if (cpuid(hv_features) & hv_rnd)
    rdmsr(hv_rnd)
   else (h == XenXenXen) {
  if (cpuid(xen_features) & xen_rnd)
    rdmsr(xen_rnd)
  }
  
  
   If we need to do chase loops, especially not so...
  
   What loops exactly? As a non-native English speaker I fail to understand
   if your answer is yes or no ;)
  
 
  The above isn't actually the full algorithm used.
 
  What part of the actual algorithm cannot be implemented? The loop that
  searches for the KVM leaf in case KVM pretends to be HyperV (is this what
  you called chase loops?)? First of all there is no need to implement it:
  if KVM pretends to be HyperV, use HyperV's way to obtain the RNG. But what
  is the problem with the loop?
 

 It can be implemented, and I've done it.  But it's a mess.  Almost the
 very first thing we do in boot (even before decompressing the kernel)
 will be to scan a bunch of cpuid leaves looking for a hypervisor with
 an rng source that we can use for kASLR.  And we'll have to update
 that code and make it bigger every time another hypervisor adds
 exactly the same feature.
 IMO implementing this feature is in each hypervisor's best interest, so the
 task of updating the code will scale, with each hypervisor's developers
 adding it for the hypervisor they care about.

I assume that you mean guest, not hypervisor.



 And then we have another copy of almost exactly the same code in the
 normal post-boot part of the kernel.

 We can certainly do this, but I'd much rather solve the problem once
 and let all of the hypervisors and guests opt in and immediately be
 compatible with each other.

  I forgot VMware because I do not see VMware people CCed. They may
  be even less excited about being told _how_ this feature needs to be
  implemented (e.g. implement HyperV leafs for the feature detection). I
  do not want to and cannot speak for VMware, but my guess is that for
  them it would be much easier to add an else clause for VMware in the above
  'if' than to coordinate with all hypervisor developers about MSR/cpuid
  details. And since this is a security feature, implementing it for Linux
  is in their best interest.

 Do you know any of them who should be cc'd?

 No, not anyone in particular. git log arch/x86/kernel/cpu/vmware.c may help.

  But VMware is an elephant in the room here, and there are other hypervisors
  out there too: VirtualBox, bhyve...

Exactly.  The amount of effort to get everything to be compatible with
everything scales quadratically in the number of hypervisors, and the
probability that some combination is broken also increases.

If we can get everyone to back something common here then this problem
goes away.

--Andy


 --
 Gleb.



-- 
Andy Lutomirski
AMA Capital Management, LLC


Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread Christopher Covington
On 09/17/2014 10:50 PM, Andy Lutomirski wrote:
 Hi all-
 
 I would like to standardize on a very simple protocol by which a guest
 OS can obtain an RNG seed early in boot.
 
 The main design requirements are:
 
  - The interface should be very easy to use.  Linux, at least, will
 want to use it extremely early in boot as part of kernel ASLR.  This
 means that PCI and ACPI will not work.

How do non-virtual systems get entropy this early? RDRAND/Padlock? Truerand?
Could hypervisors and simulators simply make sure these work?

Christopher

-- 
Employee of Qualcomm Innovation Center, Inc.
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by the Linux Foundation.


Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread Andy Lutomirski
On Fri, Sep 19, 2014 at 11:30 AM, Christopher Covington
c...@codeaurora.org wrote:
 On 09/17/2014 10:50 PM, Andy Lutomirski wrote:
 Hi all-

 I would like to standardize on a very simple protocol by which a guest
 OS can obtain an RNG seed early in boot.

 The main design requirements are:

  - The interface should be very easy to use.  Linux, at least, will
 want to use it extremely early in boot as part of kernel ASLR.  This
 means that PCI and ACPI will not work.

 How do non-virtual systems get entropy this early? RDRAND/Padlock? Truerand?
 Could hypervisors and simulators simply make sure these work?


If RDRAND is available, then Linux, at least, will use it.  The rest
are too complicated for early use.  Linux on x86 plays some vaguely
clever games with rdtsc and poking at the i8254 port.

I think that these tricks are even less useful as a guest than they
are on metal, and we can use paravirt mechanisms to make guest early
boot rngs much stronger.
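
Roughly this (a sketch; both helper names are hypothetical):

	unsigned long seed = get_boot_seed();	/* today's rdtsc/i8254 games */
	u64 pv_seed;

	if (pv_get_rng_seed(&pv_seed))		/* paravirt mechanism, if present */
		seed ^= (unsigned long)pv_seed;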

--Andy


Re: [[RFC] KVM-S390: Provide guest TOD Clock Get/Set Controls

2014-09-19 Thread Christian Borntraeger
On 09/19/2014 04:19 PM, Jason J. Herne wrote:
 From: Jason J. Herne jjhe...@us.ibm.com
 
 Enable KVM_SET_CLOCK and KVM_GET_CLOCK Ioctls on S390 for managing guest TOD
 clock value.
 

Just some education: on s390 the guest-visible TOD clock is the host TOD clock +
a hypervisor-programmable offset in the control block. There is only one TOD per
system, so the offset must be the same for every CPU.
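
Numerically (made-up values):

	guest_tod = host_tod + epoch;  /* one epoch per VM, copied into each vcpu */
	/* e.g. host_tod = 1000, epoch = -10 (wrapping u64) => guest_tod = 990 */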

 we add the KVM_CLOCK_FORWARD_ONLY flag to indicate to KVM_SET_CLOCK that the
 given clock value should only be set if it is >= the current guest TOD clock
   host TOD, 
(right?)

The alternative scheme would be to simply get/set the guest TOD time. This
works perfectly for migration, but for managedsave the guest time is in the past.
Your approach has the advantage that after managedsave the guest will (most of
the time) have the host time of the target system, avoiding a guest time
that is in the past (e.g. after 1 week of managedsave the guest would live
in the past).

Question for Paolo (and maybe others): does it make sense to reuse/extend the
existing ioctl? (I think so, but defining a new one could also be OK.)

Christian



 value. This guarantees a monotonically increasing time.
 
 NOTE: In the event that the KVM_CLOCK_FORWARD_ONLY flag is set and the given
 time would cause the guest time to jump backward, then we set the guest TOD
 clock equal to the host TOD clock. Does this behavior make sense, or is it too
 weird? I could believe that other architectures might not want this exact
 behavior. Instead they might prefer to implement the function such that an
 error code is returned instead of syncing the guest time to host time? In that
 case S390 would need another bit KVM_CLOCK_SET_TO_HOST which we could call to
 sync host time when the preferred guest time value would otherwise violate
 the monotonic property of the KVM_CLOCK_FORWARD_ONLY flag.
 
 Signed-off-by: Jason J. Herne jjhe...@us.ibm.com
 ---
  Documentation/virtual/kvm/api.txt |  5 +++
  arch/s390/kvm/kvm-s390.c  | 80 +++
  include/uapi/linux/kvm.h  |  3 ++
  3 files changed, 88 insertions(+)
 
 diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
 index beae3fd..615c2e4 100644
 --- a/Documentation/virtual/kvm/api.txt
 +++ b/Documentation/virtual/kvm/api.txt
 @@ -779,6 +779,11 @@ struct kvm_clock_data {
   __u32 pad[9];
  };
 
 +S390: KVM_CLOCK_FORWARD_ONLY is used by KVM_SET_CLOCK to indicate that the guest
 +TOD clock should not be allowed to jump back in time. This flag guarantees a
 +monotonically increasing guest clock. If the clock value specified would cause
 +the guest to jump back in time then the guest TOD clock is set to the host
 +TOD clock value.
 
  4.31 KVM_GET_VCPU_EVENTS
 
 diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
 index 81b0e11..2450db3 100644
 --- a/arch/s390/kvm/kvm-s390.c
 +++ b/arch/s390/kvm/kvm-s390.c
 @@ -31,6 +31,7 @@
  #include <asm/switch_to.h>
  #include <asm/facility.h>
  #include <asm/sclp.h>
 +#include <asm/timex.h>
  #include "kvm-s390.h"
  #include "gaccess.h"
 
 @@ -169,6 +170,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
   case KVM_CAP_S390_IRQCHIP:
   case KVM_CAP_VM_ATTRIBUTES:
   case KVM_CAP_MP_STATE:
 + case KVM_CAP_ADJUST_CLOCK:
   r = 1;
   break;
   case KVM_CAP_NR_VCPUS:
 @@ -338,6 +340,63 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
   return ret;
  }
 
 +static int kvm_s390_get_guest_tod(struct kvm *kvm, struct kvm_clock_data *user_clock)
 +{
 + u64 current_host_tod;
 + u64 epoch = 0;
 + struct kvm_vcpu *vcpu;
 + unsigned int vcpu_idx;
 + int r;
 +
 + /* All vcpu's epochs are in sync. Just Grab the 1st one */
 + kvm_for_each_vcpu(vcpu_idx, vcpu, kvm)
 + {
 + epoch = vcpu->arch.sie_block->epoch;
 + break;
 + }
 +
 + r = store_tod_clock(&current_host_tod);
 + if (r)
 + return r;
 +
 + user_clock->clock = current_host_tod + epoch;
 + return 0;
 +}
 +
 +/*
 +Set the guest's effective TOD clock to the given value. The guest's
 +TOD clock is determined by the following formula: gtod = host_tod + epoch.
 +NOTE: Even though the epoch value is associated with a vcpu, there is only
 +one TOD clock and epoch value per guest.  All vcpu's epoch values must be kept
 +synchronized.
 +NOTE: The KVM_CLOCK_FORWARD_ONLY flag is used to indicate that the guest clock
 +should only be set to the provided value if doing so does not cause guest time
 +to jump backwards. In this case we zero the epoch thereby making the guest TOD
 +clock equal to the host TOD clock.
 +*/
 +static int kvm_s390_set_guest_tod(struct kvm *kvm, struct kvm_clock_data *user_clock)
 +{
 + u64 current_host_tod, epoch;
 + struct kvm_vcpu *vcpu;
 + unsigned int 

Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread Nadav Amit

On Sep 19, 2014, at 9:42 PM, Andy Lutomirski l...@amacapital.net wrote:

 On Fri, Sep 19, 2014 at 11:30 AM, Christopher Covington
 c...@codeaurora.org wrote:
 On 09/17/2014 10:50 PM, Andy Lutomirski wrote:
 Hi all-
 
 I would like to standardize on a very simple protocol by which a guest
 OS can obtain an RNG seed early in boot.
 
 The main design requirements are:
 
 - The interface should be very easy to use.  Linux, at least, will
 want to use it extremely early in boot as part of kernel ASLR.  This
 means that PCI and ACPI will not work.
 
 How do non-virtual systems get entropy this early? RDRAND/Padlock? Truerand?
 Could hypervisors and simulators simply make sure these work?
 
 
 If RDRAND is available, then Linux, at least, will use it.  The rest
 are too complicated for early use.  Linux on x86 plays some vaguely
 clever games with rdtsc and poking at the i8254 port.
 
 I think that these tricks are even less useful as a guest than they
 are on metal, and we can use paravirt mechanisms to make guest early
 boot rngs much stronger.

Sorry for interrupting, as I understand the discussion tries to be generic.

However, it sounds to me that at least for KVM, it is very easy just to emulate 
the RDRAND instruction. The hypervisor would report to the guest that RDRAND is 
supported in CPUID and then emulate the instruction when the guest executes it. KVM 
already traps guest #UD (which would occur if RDRAND executed while it is not 
supported) - so this scheme wouldn’t introduce additional overhead over RDMSR.

Nadav--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
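
A guest-side sketch of what Nadav describes, assuming the hypervisor simply
sets the RDRAND feature bit (CPUID leaf 1, ECX bit 30) and emulates the
instruction on #UD; the loop retries because RDRAND signals success through
the carry flag:

#include <cpuid.h>
#include <stdint.h>

static int rdrand64(uint64_t *out)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned char ok;
	int i;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx) || !(ecx & (1u << 30)))
		return 0;		/* RDRAND not advertised */

	for (i = 0; i < 10; i++) {	/* CF=0 means no data available yet */
		asm volatile("rdrand %0; setc %1" : "=r"(*out), "=qm"(ok));
		if (ok)
			return 1;
	}
	return 0;
}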


Re: [RFC] KVM-S390: Provide guest TOD Clock Get/Set Controls

2014-09-19 Thread Alexander Graf


On 19.09.14 20:51, Christian Borntraeger wrote:
 On 09/19/2014 04:19 PM, Jason J. Herne wrote:
 From: Jason J. Herne jjhe...@us.ibm.com

 Enable KVM_SET_CLOCK and KVM_GET_CLOCK Ioctls on S390 for managing guest TOD
 clock value.

 
 Just some education. On s390 the guest visible TOD clock is the host TOD clock 
 + hypervisor programmable offset in the control block. There is only one TOD 
 per system, so the offset must be the same for every CPU.

Can that offset be negative?

 
 we add the KVM_CLOCK_FORWARD_ONLY flag to indicate to KVM_SET_CLOCK that the
 given clock value should only be set if it is >= the current guest TOD clock
host TOD, (right?)
 
 The alternative scheme would be to simply get/set the guest TOD time. This 
 works perfectly for migration, but for managedsave the guest time is in the 
 past.
 Your approach has the advantange that after managedsave the guest will (most 
 of the time) have the host time of the target system, avoiding the guest 
 having a time that is in the past (e.g. after 1 week of managedsave the guest 
 would live in the past).

But that's what users will expect, no? When you save an image in the
past, it should resume at that very point in time.

Also I personally don't care whether the interface is delta to now or
this is the time. In general, delta to now is safer because you
can't possibly run back in time. But you also definitely want to check
out the way PPC does it - it also accommodates for the time we spend
inside the migration path itself.


Alex

 
  Question for Paolo (and maybe others): does it make sense to reuse/extend the 
  existing ioctl (I think so, but defining a new one could also be ok)?
 
 Christian
 
 
 
 value. This guarantees a monotonically increasing time.

 NOTE: In the event that the KVM_CLOCK_FORWARD_ONLY flag is set and the given
 time would cause the guest time to jump backward, then we set the guest TOD
 clock equal to the host TOD clock. Does this behavior make sense, or is it too
 weird? I could believe that other architectures might not want this exact
 behavior. Instead they might prefer to implement the function such that an
 error code is returned instead of syncing the guest time to host time? In that
 case S390 would need another bit KVM_CLOCK_SET_TO_HOST which we could call to
 sync host time when the preferred guest time value would otherwise violate
 the monotonic property of the KVM_CLOCK_FORWARD_ONLY flag.

 Signed-off-by: Jason J. Herne jjhe...@us.ibm.com
 ---
  Documentation/virtual/kvm/api.txt |  5 +++
  arch/s390/kvm/kvm-s390.c  | 80 +++
  include/uapi/linux/kvm.h  |  3 ++
  3 files changed, 88 insertions(+)

 diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
 index beae3fd..615c2e4 100644
 --- a/Documentation/virtual/kvm/api.txt
 +++ b/Documentation/virtual/kvm/api.txt
 @@ -779,6 +779,11 @@ struct kvm_clock_data {
  __u32 pad[9];
  };

 +S390: KVM_CLOCK_FORWARD_ONLY is used by KVM_SET_CLOCK to indicate that the guest
 +TOD clock should not be allowed to jump back in time. This flag guarantees a
 +monotonically increasing guest clock. If the clock value specified would cause
 +the guest to jump back in time then the guest TOD clock is set to the host
 +TOD clock value.

  4.31 KVM_GET_VCPU_EVENTS

 diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
 index 81b0e11..2450db3 100644
 --- a/arch/s390/kvm/kvm-s390.c
 +++ b/arch/s390/kvm/kvm-s390.c
 @@ -31,6 +31,7 @@
  #include asm/switch_to.h
  #include asm/facility.h
  #include asm/sclp.h
 +#include asm/timex.h
  #include kvm-s390.h
  #include gaccess.h

 @@ -169,6 +170,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
  case KVM_CAP_S390_IRQCHIP:
  case KVM_CAP_VM_ATTRIBUTES:
  case KVM_CAP_MP_STATE:
 +case KVM_CAP_ADJUST_CLOCK:
  r = 1;
  break;
  case KVM_CAP_NR_VCPUS:
 @@ -338,6 +340,63 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
  return ret;
  }

 +static int kvm_s390_get_guest_tod(struct kvm *kvm, struct kvm_clock_data *user_clock)
 +{
 +u64 current_host_tod;
 +u64 epoch = 0;
 +struct kvm_vcpu *vcpu;
 +unsigned int vcpu_idx;
 +int r;
 +
 +/* All vcpu's epochs are in sync. Just Grab the 1st one */
 +kvm_for_each_vcpu(vcpu_idx, vcpu, kvm)
 +{
 +epoch = vcpu-arch.sie_block-epoch;
 +break;
 +}
 +
 +r = store_tod_clock(current_host_tod);
 +if (r)
 +return r;
 +
 +user_clock-clock = current_host_tod + epoch;
 +return 0;
 +}
 +
 +/*
 +Set the guest's effective TOD clock to the given value. The guest's
 +TOD clock is determined by the following formula: gtod = host_tod + epoch.
 +NOTE: Even though the epoch value is associated with a vcpu, there is only
 +one TOD clock and epoch value per guest.  All vcpu's epoch 
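
A condensed sketch of the epoch arithmetic the comment above describes;
tod_epoch is a hypothetical helper, not code from the patch:

static u64 tod_epoch(u64 requested_gtod, u64 host_tod, u64 old_epoch,
		     bool forward_only)
{
	/* gtod = host_tod + epoch  =>  epoch = requested_gtod - host_tod */
	u64 epoch = requested_gtod - host_tod;

	/* the requested time is behind the current guest TOD: zero the
	 * epoch, i.e. set the guest TOD equal to the host TOD */
	if (forward_only && requested_gtod < host_tod + old_epoch)
		epoch = 0;

	return epoch;
}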

Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread Andy Lutomirski
On Fri, Sep 19, 2014 at 1:21 PM, Nadav Amit nadav.a...@gmail.com wrote:

 On Sep 19, 2014, at 9:42 PM, Andy Lutomirski l...@amacapital.net wrote:

 On Fri, Sep 19, 2014 at 11:30 AM, Christopher Covington
 c...@codeaurora.org wrote:
 On 09/17/2014 10:50 PM, Andy Lutomirski wrote:
 Hi all-

 I would like to standardize on a very simple protocol by which a guest
 OS can obtain an RNG seed early in boot.

 The main design requirements are:

 - The interface should be very easy to use.  Linux, at least, will
 want to use it extremely early in boot as part of kernel ASLR.  This
 means that PCI and ACPI will not work.

 How do non-virtual systems get entropy this early? RDRAND/Padlock? Truerand?
 Could hypervisors and simulators simply make sure these work?


 If RDRAND is available, then Linux, at least, will use it.  The rest
 are too complicated for early use.  Linux on x86 plays some vaguely
 clever games with rdtsc and poking at the i8254 port.

 I think that these tricks are even less useful as a guest than they
 are on metal, and we can use paravirt mechanisms to make guest early
 boot rngs much stronger.

 Sorry for interrupting, as I understand the discussion tries to be generic.

 However, it sounds to me that at least for KVM, it is very easy just to 
 emulate the RDRAND instruction. The hypervisor would report to the guest that 
 RDRAND is supported in CPUID and then emulate the instruction when the guest 
 executes it. KVM already traps guest #UD (which would occur if RDRAND 
 executed while it is not supported) - so this scheme wouldn’t introduce 
 additional overhead over RDMSR.

Because then guest user code will think that rdrand is there and will
try to use it, resulting in abysmal performance.

--Andy


 Nadav



-- 
Andy Lutomirski
AMA Capital Management, LLC
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread Gleb Natapov
On Fri, Sep 19, 2014 at 11:20:49AM -0700, Andy Lutomirski wrote:
 [cc: Alok Kataria at VMware]
 
 On Fri, Sep 19, 2014 at 11:12 AM, Gleb Natapov g...@kernel.org wrote:
  On Fri, Sep 19, 2014 at 11:02:38AM -0700, Andy Lutomirski wrote:
  On Fri, Sep 19, 2014 at 10:49 AM, Gleb Natapov g...@kernel.org wrote:
   On Fri, Sep 19, 2014 at 10:18:37AM -0700, H. Peter Anvin wrote:
   On 09/19/2014 10:15 AM, Gleb Natapov wrote:
On Fri, Sep 19, 2014 at 10:08:20AM -0700, H. Peter Anvin wrote:
On 09/19/2014 09:53 AM, Gleb Natapov wrote:
On Fri, Sep 19, 2014 at 09:40:07AM -0700, H. Peter Anvin wrote:
On 09/19/2014 09:37 AM, Gleb Natapov wrote:
   
Linux detects what hypervisor it runs on very early
   
Not anywhere close to early enough.  We're talking for uses like kASLR.
   
Still too early to do:
   
   h = cpuid(HYPERVISOR_SIGNATURE)
   if (h == KVMKVMKVM) {
      if (cpuid(kvm_features) & kvm_rnd)
         rdmsr(kvm_rnd)
   } else if (h == HyperV) {
      if (cpuid(hv_features) & hv_rnd)
         rdmsr(hv_rnd)
   } else if (h == XenXenXen) {
      if (cpuid(xen_features) & xen_rnd)
         rdmsr(xen_rnd)
   }
   
   
If we need to do chase loops, especially not so...
   
What loops exactly? As a non-native English speaker I fail to understand
if your answer is yes or no ;)
   
  
   The above isn't actually the full algorithm used.
  
   What part of the actual algorithm cannot be implemented? The loop that searches
   for the KVM leaf in case KVM pretends to be HyperV (is this what you called
   chase loops?)? First of all there is no need to implement it, if KVM
   pretends to be HyperV use HyperV's way to obtain the RNG, but what is the
   problem with the loop?
  
 
  It can be implemented, and I've done it.  But it's a mess.  Almost the
  very first thing we do in boot (even before decompressing the kernel)
  will be to scan a bunch of cpuid leaves looking for a hypervisor with
  an rng source that we can use for kASLR.  And we'll have to update
  that code and make it bigger every time another hypervisor adds
  exactly the same feature.
  IMO implementing this feature is in a hypervisor's best interest, so the task
  of updating the code will scale by virtue of hypervisor developers each
  adding it for the hypervisor they care about.
 
 I assume that you mean guest, not hypervisor.
 
Yes, I mean guest support for the hypervisor they care about.

 
 
  And then we have another copy of almost exactly the same code in the
  normal post-boot part of the kernel.
 
  We can certainly do this, but I'd much rather solve the problem once
  and let all of the hypervisors and guests opt in and immediately be
  compatible with each other.
 
   I forgot VMware because I do not see any VMware people CCed. They may
   be even less excited about being told _how_ this feature needs to be
   implemented (e.g. implement the HyperV leafs for feature detection). I
   do not want to and cannot speak for VMware, but my guess is that for
   them it would be much easier to add an else clause for VMware in the above
   if than to coordinate with all hypervisor developers about MSR/cpuid
   details. And since this is a security feature, implementing it for Linux
   is in their best interest.
 
  Do you know any of them who should be cc'd?
 
  No, not anyone in particular. git log arch/x86/kernel/cpu/vmware.c may help.
 
  But VMware is an elephant in the room here. There are other hypervisors out 
  there.
  VirtualBox, bhyve...
 
 Exactly.  The amount of effort to get everything to be compatible with
 everything scales quadratically in the number of hypervisors, and the
 probability that some combination is broken also increases.
 
The effort is distributed equally among hypervisor developers. If they
want Linux to be more secure on their hypervisor they contribute guest
code. They need to write the hypervisor part anyway. On CPUs with the RDRAND
instruction this MSR is not even needed, and some hypervisors may decide
that support for old CPUs is not worth the effort. A unified interface
does not help if a hypervisor does not implement it.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
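
A sketch of the detection loop under discussion: read the 12-byte vendor
signature from CPUID leaf 0x40000000 (returned in EBX, ECX, EDX) and branch
per hypervisor. The feature leaves and MSR names are placeholders; no such
interface is defined yet:

#include <stdint.h>
#include <string.h>

static inline void cpuid(uint32_t leaf, uint32_t *a, uint32_t *b,
			 uint32_t *c, uint32_t *d)
{
	asm volatile("cpuid" : "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
		     : "a"(leaf));
}

static int hv_sig_is(const char *sig)
{
	uint32_t a, regs[3];
	char buf[13];

	cpuid(0x40000000, &a, &regs[0], &regs[1], &regs[2]);
	memcpy(buf, regs, 12);		/* signature lives in EBX:ECX:EDX */
	buf[12] = '\0';
	return !strcmp(buf, sig);
}

/*
 * if (hv_sig_is("KVMKVMKVM"))          check kvm_features, rdmsr(kvm_rnd)
 * else if (hv_sig_is("Microsoft Hv"))  check hv_features, rdmsr(hv_rnd)
 * else if (hv_sig_is("XenVMMXenVMM"))  and so on, per hypervisor
 */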


Re: [PATCH v3 2/2] KVM: x86: directly use kvm_make_request again

2014-09-19 Thread Radim Krčmář
2014-09-19 21:35+0800, Xiao Guangrong:
 On 09/19/2014 08:25 PM, Radim Krčmář wrote:
* Returns 1 to let __vcpu_run() continue the guest execution loop without
* exiting to the userspace.  Otherwise, the value will be returned to the
  @@ -6018,8 +6024,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
kvm_mmu_sync_roots(vcpu);
if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
  - ++vcpu-stat.tlb_flush;
  - kvm_x86_ops-tlb_flush(vcpu);
  + kvm_vcpu_flush_tlb(vcpu);
 
  NACK!
 
  I do not understand why you have to introduce a meaningful name
  here - it's used just inside a function, which cannot help to
  improve the readability of the code at all.
  
  I prefer the new hunk
   - it makes the parent function simpler (not everyone wants to read how
 we do tlb flushes when looking at vcpu_enter_guest)
 
 Using one line instead of two lines does not simplify the parent function much.

(Don't forget braces!)

There might come a patch that pushes the length above a readability
threshold.  With our development process, I think it is quite likely
that the new function won't get created then;
and preventing this situation makes the function nicer now as well.

(Most of my thinking is about cases that will never happen.)

   - the function is properly named
 
 kvm_x86_ops-tlb_flush(vcpu) is also a good hint to tell the reader it is
 doing a tlb flush. :)

Yep.  (The surprise was leaked by KVM_REQ_TLB_FLUSH.)

It was more like safety check -- if we wanted a new function, it should
be called like that.

   - we do a similar thing with kvm_gen_kvmclock_update
 
 I understand this raw-bit-set style is largely used in current kvm code,
 however, it does not mean it's the best way to do it. It may be turned off
 someday as it is used in more and more places.
 
 Anyway, the meaningful name wrapping raw-bit-set is a right direction
 and let's keep this right direction.

Agreed, it would be nice to have an indirection that hides the
underlying request-mechanic from higher-level code.

(More below.)

  My issues with kvm_mmu_flush_tlb:
  
   - 'kvm_flush_remote_tlbs()' calls tlb request directly;
  our wrapper thus cannot be extended with features, which makes it a
  poor abstraction
 
 kvm_flush_remote_tlbs does not only set tlb request but also handles memory
 order and syncs the tlb state.
 
 I guess you wanted to say kvm_mmu_flush_tlb here; it is an API name and lets
 it be easily used in other files. It's not worth committing a patch that does
 nothing except revert the meaningful name.

(I really meant kvm_flush_remote_tlbs().)

When we change kvm_mmu_flush_tlb(), it doesn't get propagated to
remote TLB flushes = we might have a false sense of API and
the code is harder to work with because of that.

(I don't consider kvm_mmu_flush_tlb() a step in the right direction ...
 close, like all bugs.)

   - we don't do this for other requests
 
 See above.

(Below is here.)

Between half-new half-old and unmixed API, I'm leaning towards the
latter option ...
(My arguments for this are weak though; not enough experience.)

   - direct request isn't absolutely horrible to read and write
 (I totally agree that it is bad.)
   - we call one function 'kvm_mmu_flush_tlb()' and the second one
 'kvm_flush_remote_tlbs()' and I'd need to look why
 
 Yeah, this is why I suggested renaming kvm_mmu_flush_tlb, which clarifies
 things better:
 - kvm_flush_remote_tlbs: flush tlb in all vcpus
 - kvm_vcpu_flush_tlb: only flush tlb on the vcpu specified by @vcpu.

(I am confused about mmu in names -- kvm_flush_remote_tlbs is shared
 through host.h, which is probably why it didn't get mmu.)

  Which is why just removing it solves more problems for me :)
 
 Thank you for raising this question and letting me know the patch's history. :)

Thanks for the reply, I hope I have understood you correctly,
now just to find a person to write all the good code :)
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
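
For reference, the helper being debated, reconstructed from the hunk quoted
above (a static function in x86.c in the v3 patch); it wraps only the
vcpu-local flush, unlike kvm_flush_remote_tlbs() which requests a flush on
every vcpu:

static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
{
	++vcpu->stat.tlb_flush;
	kvm_x86_ops->tlb_flush(vcpu);
}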


Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread H. Peter Anvin
On 09/19/2014 01:46 PM, Andy Lutomirski wrote:

 However, it sounds to me that at least for KVM, it is very easy just to 
 emulate the RDRAND instruction. The hypervisor would report to the guest 
 that RDRAND is supported in CPUID and then emulate the instruction when the guest 
 executes it. KVM already traps guest #UD (which would occur if RDRAND 
 executed while it is not supported) - so this scheme wouldn’t introduce 
 additional overhead over RDMSR.
 
 Because then guest user code will think that rdrand is there and will
 try to use it, resulting in abysmal performance.
 

Yes, the presence of RDRAND implies a cheap and inexhaustible entropy
source.

-hpa


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread Theodore Ts'o
On Fri, Sep 19, 2014 at 09:40:42AM -0700, H. Peter Anvin wrote:
 
 There is a huge disadvantage to the fact that CPUID is a user space
 instruction, though.

But if the goal is to provide something like getrandom(2) direct from
the Host OS, it's not necessarily harmful to allow the Guest ring 3
code to be able to fetch randomness in that way.  The hypervisor can
implement rate limiting to protect against the guest using this too
frequently, but this is something that you should be doing for guest
ring 0 code anyway, since from the POV of the hypervisor Guest ring 0
is not necessarily any more trusted than Guest ring 3.

   - Ted
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
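
One shape the rate limiting Ted mentions could take on the host side: a
token bucket charged per seed read. The struct, names and limits here are
purely illustrative, kernel-style pseudocode rather than a defined interface:

struct seed_bucket {
	u64	tokens;		/* reads currently allowed */
	u64	last_ns;	/* timestamp of the last refill */
};

#define SEED_BUCKET_MAX	8
#define SEED_REFILL_NS	(100 * NSEC_PER_MSEC)	/* one token per 100ms */

static bool seed_read_allowed(struct seed_bucket *b, u64 now_ns)
{
	u64 refill = (now_ns - b->last_ns) / SEED_REFILL_NS;

	if (refill) {
		b->tokens = min(b->tokens + refill, (u64)SEED_BUCKET_MAX);
		b->last_ns = now_ns;
	}
	if (!b->tokens)
		return false;
	b->tokens--;
	return true;
}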


Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread Andy Lutomirski
On Fri, Sep 19, 2014 at 3:05 PM, Theodore Ts'o ty...@mit.edu wrote:
 On Fri, Sep 19, 2014 at 09:40:42AM -0700, H. Peter Anvin wrote:

 There is a huge disadvantage to the fact that CPUID is a user space
 instruction, though.

 But if the goal is to provide something like getrandom(2) direct from
 the Host OS, it's not necessarily harmful to allow the Guest ring 3
 code to be able to fetch randomness in that way.  The hypervisor can
 implement rate limiting to protect against the guest using this too
 frequently, but this is something that you should be doing for guest
 ring 0 code anyway, since from the POV of the hypervisor Guest ring 0
 is not necessarily any more trusted than Guest ring 3.

On the other hand, the guest kernel might not want the guest ring 3 to
be able to get random numbers.

--Andy
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread Nakajima, Jun
On Fri, Sep 19, 2014 at 3:06 PM, Andy Lutomirski l...@amacapital.net wrote:
 On Fri, Sep 19, 2014 at 3:05 PM, Theodore Ts'o ty...@mit.edu wrote:
 On Fri, Sep 19, 2014 at 09:40:42AM -0700, H. Peter Anvin wrote:

 There is a huge disadvantage to the fact that CPUID is a user space
 instruction, though.

 But if the goal is to provide something like getrandom(2) direct from
 the Host OS, it's not necessarily harmful to allow the Guest ring 3
 code to be able to fetch randomness in that way.  The hypervisor can
 implement rate limiting to protect against the guest using this too
 frequently, but this is something that you should be doing for guest
 ring 0 code anyway, since from the POV of the hypervisor Guest ring 0
 is not necessarily any more trusted than Guest ring 3.

 On the other hand, the guest kernel might not want the guest ring 3 to
 be able to get random numbers.


But the RDSEED instruction, for example, is available at user level,
and I'm not sure that the kernel can do anything about that.

-- 
Jun
Intel Open Source Technology Center
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread Theodore Ts'o
On Fri, Sep 19, 2014 at 03:06:55PM -0700, Andy Lutomirski wrote:
 On Fri, Sep 19, 2014 at 3:05 PM, Theodore Ts'o ty...@mit.edu wrote:
  On Fri, Sep 19, 2014 at 09:40:42AM -0700, H. Peter Anvin wrote:
 
  There is a huge disadvantage to the fact that CPUID is a user space
  instruction, though.
 
  But if the goal is to provide something like getrandom(2) direct from
  the Host OS, it's not necessarily harmful to allow the Guest ring 3
  code to be able to fetch randomness in that way.  The hypervisor can
  implement rate limiting to protect against the guest using this too
  frequently, but this is something that you should be doing for guest
  ring 0 code anyway, since from the POV of the hypervisor Guest ring 0
  is not necessarily any more trusted than Guest ring 3.
 
 On the other hand, the guest kernel might not want the guest ring 3 to
 be able to get random numbers.

Um, why?

We're talking about using this to seed the RNG, and not something that
the guest kernel would be using continuously.  So what's the problem
with letting the guest ring 3 get random numbers from the host?

   - Ted
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] kvm: don't take vcpu mutex for obviously invalid vcpu ioctls

2014-09-19 Thread David Matlack
vcpu ioctls can hang the calling thread if issued while a vcpu is
running. If we know ioctl is going to be rejected as invalid anyway,
we can fail before trying to take the vcpu mutex.

This patch does not change functionality, it just makes invalid ioctls
fail faster.

Signed-off-by: David Matlack dmatl...@google.com
---
 virt/kvm/kvm_main.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 96ec622..f9234e5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -52,6 +52,7 @@
 
 #include asm/processor.h
 #include asm/io.h
+#include asm/ioctl.h
 #include asm/uaccess.h
 #include asm/pgtable.h
 
@@ -1975,6 +1976,9 @@ static long kvm_vcpu_ioctl(struct file *filp,
if (vcpu-kvm-mm != current-mm)
return -EIO;
 
+   if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
+   return -EINVAL;
+
 #if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
/*
 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
-- 
2.1.0.rc2.206.gedb03e5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
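
To make the check concrete: every ioctl in the KVM uapi encodes KVMIO (0xAE)
in its type byte, e.g. KVM_RUN is _IO(KVMIO, 0x80), so _IOC_TYPE(KVM_RUN) ==
KVMIO. A stray ioctl such as the TTY TCGETS carries type 'T' (0x54) and now
fails fast with -EINVAL instead of blocking on the vcpu mutex. A quick
userspace check of that claim:

#include <stdio.h>
#include <linux/kvm.h>

int main(void)
{
	/* both should print 0xae */
	printf("_IOC_TYPE(KVM_RUN) = %#x, KVMIO = %#x\n",
	       _IOC_TYPE(KVM_RUN), KVMIO);
	return 0;
}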


Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread Andy Lutomirski
On Fri, Sep 19, 2014 at 3:57 PM, Theodore Ts'o ty...@mit.edu wrote:
 On Fri, Sep 19, 2014 at 03:06:55PM -0700, Andy Lutomirski wrote:
 On Fri, Sep 19, 2014 at 3:05 PM, Theodore Ts'o ty...@mit.edu wrote:
  On Fri, Sep 19, 2014 at 09:40:42AM -0700, H. Peter Anvin wrote:
 
  There is a huge disadvantage to the fact that CPUID is a user space
  instruction, though.
 
  But if the goal is to provide something like getrandom(2) direct from
  the Host OS, it's not necessarily harmful to allow the Guest ring 3
  code to be able to fetch randomness in that way.  The hypervisor can
  implement rate limiting to protect against the guest using this too
  frequently, but this is something that you should be doing for guest
  ring 0 code anyway, since from the POV of the hypervisor Guest ring 0
  is not necessarily any more trusted than Guest ring 3.

 On the other hand, the guest kernel might not want the guest ring 3 to
 be able to get random numbers.

 Um, why?

To force deterministic execution.

I incorrectly thought that the kernel could switch RDRAND on and off.
It turns out that a hypervisor can do this, but not the kernel.  Also,
determinism is lost anyway because of TSX, which *also* can't be
turned on and off.


 We're talking about using this to seed the RNG, and not something that
 the guest kernel would be using continuously.  So what's the problem
 with letting the guest ring get random numbers from the host?

I object to preventing guest kernels from limiting the privileges of
their own userspace.  Letting guest CPL3 do this is essentially
setting guest policy in the hypervisor, which I dislike if we can
avoid it.

Admittedly, in this case, control of RNG availability in guest
userspace may be a lost cause regardless.

--Andy
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread H. Peter Anvin
On 09/19/2014 04:12 PM, Andy Lutomirski wrote:
 
 To force deterministic execution.
 
 I incorrectly thought that the kernel could switch RDRAND on and off.
 It turns out that a hypervisor can do this, but not the kernel.  Also,
 determinism is lost anyway because of TSX, which *also* can't be
 turned on and off.
 

Actually, a much bigger reason is because it lets rogue guest *user
space*, even with a well-behaved guest OS, do something potentially
harmful to the host.

-hpa


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread Theodore Ts'o
On Fri, Sep 19, 2014 at 04:29:53PM -0700, H. Peter Anvin wrote:
 
 Actually, a much bigger reason is because it lets rogue guest *user
 space*, even with a well-behaved guest OS, do something potentially
 harmful to the host.

Right, but if the host kernel is dependent on the guest OS for
security, the game is over.  The Guest Kernel must NEVER be able to
do anything harmful to the host.  If it can, it is a severe security
bug in KVM that must be fixed ASAP.

- Ted
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread Andy Lutomirski
On Fri, Sep 19, 2014 at 4:35 PM, Theodore Ts'o ty...@mit.edu wrote:
 On Fri, Sep 19, 2014 at 04:29:53PM -0700, H. Peter Anvin wrote:

 Actually, a much bigger reason is because it lets rogue guest *user
 space*, even with a well-behaved guest OS, do something potentially
 harmful to the host.

 Right, but if the host kernel is dependent on the guest OS for
 security, the game is over.  The Guest Kernel must NEVER be able to
 do anything harmful to the host.  If it can, it is a severe security
 bug in KVM that must be fixed ASAP.

Nonetheless, I suspect that some OS kernel author, somewhere, will
object to having a hypervisor that exposes new capabilities to guest
CPL 3 without requiring the guest to opt in, if for no other reason
than that it slightly increases the attack surface.

I certainly object on these grounds.

--Andy
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm: Make init_rmode_tss() return 0 on success.

2014-09-19 Thread Radim Krčmář
2014-09-16 13:38+0200, Paolo Bonzini:
 In init_rmode_tss(), there are two variables indicating the return
 value, r and ret, and it returns 0 on error, 1 on success. The function
 is only called by vmx_set_tss_addr(), and r is redundant.
 
 This patch removes the redundant variable, by making init_rmode_tss()
 return 0 on success, -errno on failure.

Which is going to propagate all the way to userspace through ioctl ...
is this change of A[PB]I acceptable?

Otherwise, -EFAULT seems to match unlikely problems better than -ENOMEM,
so if it is acceptable:  Reviewed-by: Radim Krčmář rkrc...@redhat.com
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Standardizing an MSR or other hypercall to get an RNG seed?

2014-09-19 Thread H. Peter Anvin
On 09/19/2014 04:35 PM, Theodore Ts'o wrote:
 On Fri, Sep 19, 2014 at 04:29:53PM -0700, H. Peter Anvin wrote:

 Actually, a much bigger reason is because it lets rogue guest *user
 space*, even with a well-behaved guest OS, do something potentially
 harmful to the host.
 
 Right, but if the host kernel is dependent on the guest OS for
 security, the game is over.  The Guest Kernel must NEVER be able to
 do anything harmful to the host.  If it can, it is a severe security
 bug in KVM that must be fixed ASAP.
 

Security and being resource well-behaved are two different things.

-hpa


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 15/15] arm/arm64: KVM: allow userland to request a virtual GICv3

2014-09-19 Thread wanghaibin
On 2014/8/21 21:06, Andre Przywara wrote:

 With everything in place we allow userland to request the kernel
 using a virtual GICv3 in the guest, which finally lifts the 8 vCPU
 limit for a guest.
 Also we provide the necessary support for guests setting the memory
 addresses for the virtual distributor and redistributors.
 This requires some userland code to make use of that feature and
 explicitly ask for a virtual GICv3.
 
 Signed-off-by: Andre Przywara andre.przyw...@arm.com
 ---
  arch/arm64/include/uapi/asm/kvm.h |7 ++
  include/kvm/arm_vgic.h|4 ++--
  virt/kvm/arm/vgic-v3-emul.c   |3 +++
  virt/kvm/arm/vgic.c   |   46 ++---
  4 files changed, 45 insertions(+), 15 deletions(-)
 
 diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
 index 5513de4..9a62081 100644
 --- a/arch/arm64/include/uapi/asm/kvm.h
 +++ b/arch/arm64/include/uapi/asm/kvm.h
 @@ -77,6 +77,13 @@ struct kvm_regs {
  #define KVM_VGIC_V2_DIST_SIZE0x1000
  #define KVM_VGIC_V2_CPU_SIZE 0x2000
  
 +/* Supported VGICv3 address types  */
 +#define KVM_VGIC_V3_ADDR_TYPE_DIST   2
 +#define KVM_VGIC_V3_ADDR_TYPE_REDIST 3
 +
 +#define KVM_VGIC_V3_DIST_SIZESZ_64K
 +#define KVM_VGIC_V3_REDIST_SIZE  (2 * SZ_64K)
 +
  #define KVM_ARM_VCPU_POWER_OFF   0 /* CPU is started in OFF state */
  #define KVM_ARM_VCPU_EL1_32BIT   1 /* CPU running a 32bit VM */
  #define KVM_ARM_VCPU_PSCI_0_22 /* CPU uses PSCI v0.2 */
 diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
 index 3b164ee..82e00a5 100644
 --- a/include/kvm/arm_vgic.h
 +++ b/include/kvm/arm_vgic.h
 @@ -35,8 +35,8 @@
  #define VGIC_MAX_IRQS1024
  
  /* Sanity checks... */
 -#if (KVM_MAX_VCPUS > 8)
 -#error   Invalid number of CPU interfaces
 +#if (KVM_MAX_VCPUS > 255)
 +#error Too many KVM VCPUs, the VGIC only supports up to 255 VCPUs for now
  #endif
  
  #if (VGIC_NR_IRQS_LEGACY  31)
 diff --git a/virt/kvm/arm/vgic-v3-emul.c b/virt/kvm/arm/vgic-v3-emul.c
 index ac5c5ee..d4d9fe4 100644
 --- a/virt/kvm/arm/vgic-v3-emul.c
 +++ b/virt/kvm/arm/vgic-v3-emul.c
 @@ -874,6 +874,9 @@ static int vgic_v3_has_attr(struct kvm_device *dev,
   case KVM_VGIC_V2_ADDR_TYPE_DIST:
   case KVM_VGIC_V2_ADDR_TYPE_CPU:
   return -ENXIO;
 + case KVM_VGIC_V3_ADDR_TYPE_DIST:
 + case KVM_VGIC_V3_ADDR_TYPE_REDIST:
 + return 0;
   }
   break;
   case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
 diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
 index b1bcb6d..f13ab4e 100644
 --- a/virt/kvm/arm/vgic.c
 +++ b/virt/kvm/arm/vgic.c
 @@ -1567,7 +1567,7 @@ static int vgic_ioaddr_assign(struct kvm *kvm, phys_addr_t *ioaddr,
  /**
   * kvm_vgic_addr - set or get vgic VM base addresses
   * @kvm:   pointer to the vm struct
 - * @type:  the VGIC addr type, one of KVM_VGIC_V2_ADDR_TYPE_XXX
 + * @type:  the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX
   * @addr:  pointer to address value
   * @write: if true set the address in the VM address space, if false read the
   *  address
 @@ -1581,29 +1581,49 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
  {
   int r = 0;
   struct vgic_dist *vgic = kvm-arch.vgic;
 + int type_needed;
 + phys_addr_t *addr_ptr, block_size;
  
   mutex_lock(kvm-lock);
   switch (type) {
   case KVM_VGIC_V2_ADDR_TYPE_DIST:
 - if (write) {
 - r = vgic_ioaddr_assign(kvm, vgic-vgic_dist_base,
 -*addr, KVM_VGIC_V2_DIST_SIZE);
 - } else {
 - *addr = vgic-vgic_dist_base;
 - }
 + type_needed = KVM_DEV_TYPE_ARM_VGIC_V2;
 + addr_ptr = vgic-vgic_dist_base;
 + block_size = KVM_VGIC_V2_DIST_SIZE;
   break;
   case KVM_VGIC_V2_ADDR_TYPE_CPU:
 - if (write) {
 - r = vgic_ioaddr_assign(kvm, vgic-vgic_cpu_base,
 -*addr, KVM_VGIC_V2_CPU_SIZE);
 - } else {
 - *addr = vgic-vgic_cpu_base;
 - }
 + type_needed = KVM_DEV_TYPE_ARM_VGIC_V2;
 + addr_ptr = vgic-vgic_cpu_base;
 + block_size = KVM_VGIC_V2_CPU_SIZE;
   break;
 +#ifdef CONFIG_ARM_GIC_V3
 + case KVM_VGIC_V3_ADDR_TYPE_DIST:
 + type_needed = KVM_DEV_TYPE_ARM_VGIC_V3;
 + addr_ptr = vgic-vgic_dist_base;
 + block_size = KVM_VGIC_V3_DIST_SIZE;
 + break;
 + case KVM_VGIC_V3_ADDR_TYPE_REDIST:


Hi, Andre, this is the redist region base address, isn't it?
So, I think the block_size should be (online_vcpus * KVM_VGIC_V3_REDIST_SIZE).

 + type_needed = KVM_DEV_TYPE_ARM_VGIC_V3;
 +