[PATCH RESEND v15 10/10] KVM: arm/arm64: Enable Dirty Page logging for ARMv8

2015-01-08 Thread Mario Smarduch
This patch enables ARMv8 dirty page logging support. It plugs ARMv8 into the
generic layer through a Kconfig symbol and drops the earlier ARM64 constraints
to enable logging at the architecture layer. It applies cleanly on top of the
patch series posted Dec 15:
https://lists.cs.columbia.edu/pipermail/kvmarm/2014-December/012826.html

Patch #11 of the series has been dropped.
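
For context (not part of this patch), a minimal userspace sketch of how the
feature is consumed once enabled: a memslot is registered with
KVM_MEM_LOG_DIRTY_PAGES and the per-page bitmap is read back with
KVM_GET_DIRTY_LOG. Slot number, guest physical address and sizes below are
placeholders, and vcpu creation/run is elided.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = ioctl(kvm, KVM_CREATE_VM, 0);
	size_t size = 2 * 1024 * 1024;			/* placeholder region size */
	void *mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
			 MAP_SHARED | MAP_ANONYMOUS, -1, 0);

	/* Register slot 0 with dirty page logging enabled. */
	struct kvm_userspace_memory_region region = {
		.slot = 0,
		.flags = KVM_MEM_LOG_DIRTY_PAGES,
		.guest_phys_addr = 0x80000000,		/* placeholder IPA */
		.memory_size = size,
		.userspace_addr = (uintptr_t)mem,
	};
	if (ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region) < 0) {
		perror("KVM_SET_USER_MEMORY_REGION");
		return 1;
	}

	/* ... create and run vcpus here; the guest dirties pages ... */

	/* Retrieve the dirty bitmap: one bit per page in the slot. */
	uint64_t *bitmap = calloc(size / 4096 / 64 + 1, sizeof(*bitmap));
	struct kvm_dirty_log log = { .slot = 0, .dirty_bitmap = bitmap };
	if (ioctl(vm, KVM_GET_DIRTY_LOG, &log) < 0) {
		perror("KVM_GET_DIRTY_LOG");
		return 1;
	}
	for (size_t i = 0; i < size / 4096; i++)
		if (bitmap[i / 64] & (1ULL << (i % 64)))
			printf("page %zu is dirty\n", i);
	return 0;
}

Each KVM_GET_DIRTY_LOG call also clears the returned bits and, with this
series, write-protects the reported pages again so that further guest writes
fault and get logged.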

Signed-off-by: Mario Smarduch m.smard...@samsung.com
---

Change Log:
- removed inline definition from kvm_flush_remote_tlbs() 
---
 arch/arm/include/asm/kvm_host.h |   12 
 arch/arm/kvm/arm.c  |4 
 arch/arm/kvm/mmu.c  |   19 +++
 arch/arm64/kvm/Kconfig  |2 ++
 4 files changed, 13 insertions(+), 24 deletions(-)

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index b138431..088ea87 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -223,18 +223,6 @@ static inline void __cpu_init_hyp_mode(phys_addr_t 
boot_pgd_ptr,
kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr);
 }
 
-/**
- * kvm_flush_remote_tlbs() - flush all VM TLB entries
- * @kvm:   pointer to kvm structure.
- *
- * Interface to HYP function to flush all VM TLB entries without address
- * parameter.
- */
-static inline void kvm_flush_remote_tlbs(struct kvm *kvm)
-{
-   kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
-}
-
 static inline int kvm_arch_dev_ioctl_check_extension(long ext)
 {
return 0;
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 6e4290c..1b6577c 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -740,7 +740,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
  */
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 {
-#ifdef CONFIG_ARM
bool is_dirty = false;
int r;
 
@@ -753,9 +752,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct 
kvm_dirty_log *log)
 
	mutex_unlock(&kvm->slots_lock);
return r;
-#else /* arm64 */
-   return -EINVAL;
-#endif
 }
 
 static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 2bfe22d..1384743 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -52,11 +52,18 @@ static phys_addr_t hyp_idmap_vector;
 
 static bool memslot_is_logging(struct kvm_memory_slot *memslot)
 {
-#ifdef CONFIG_ARM
	return !!memslot->dirty_bitmap;
-#else
-   return false;
-#endif
+}
+
+/**
+ * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
+ * @kvm:   pointer to kvm structure.
+ *
+ * Interface to HYP function to flush all VM TLB entries
+ */
+void kvm_flush_remote_tlbs(struct kvm *kvm)
+{
+   kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
 }
 
 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
@@ -883,7 +890,6 @@ static bool kvm_is_device_pfn(unsigned long pfn)
return !pfn_valid(pfn);
 }
 
-#ifdef CONFIG_ARM
 /**
  * stage2_wp_ptes - write protect PMD range
  * @pmd:   pointer to pmd entry
@@ -1028,7 +1034,6 @@ void kvm_arch_mmu_write_protect_pt_masked(struct kvm *kvm,
 
stage2_wp_range(kvm, start, end);
 }
-#endif
 
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
  struct kvm_memory_slot *memslot, unsigned long hva,
@@ -1457,7 +1462,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
   const struct kvm_memory_slot *old,
   enum kvm_mr_change change)
 {
-#ifdef CONFIG_ARM
/*
 * At this point memslot has been committed and there is an
 * allocated dirty_bitmap[], dirty pages will be be tracked while the
@@ -1465,7 +1469,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 */
	if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES)
kvm_mmu_wp_memory_region(kvm, mem-slot);
-#endif
 }
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 8ba85e9..3ce389b 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -22,10 +22,12 @@ config KVM
select PREEMPT_NOTIFIERS
select ANON_INODES
select HAVE_KVM_CPU_RELAX_INTERCEPT
+   select HAVE_KVM_ARCH_TLB_FLUSH_ALL
select KVM_MMIO
select KVM_ARM_HOST
select KVM_ARM_VGIC
select KVM_ARM_TIMER
+   select KVM_GENERIC_DIRTYLOG_READ_PROTECT
---help---
  Support hosting virtualized guest machines.
 
-- 
1.7.9.5



Re: [PATCH 1/1] KVM: ioapic: Record edge-triggered interrupts delivery status.

2015-01-08 Thread Wincy Van
Ping..

Hi, Paolo, could you please have a look at this patch ?


Thanks,

Wincy


[PATCH RESEND v15 07/10] KVM: arm: page logging 2nd stage fault handling

2015-01-08 Thread Mario Smarduch
This patch adds support for handling 2nd stage page faults during migration:
it disables faulting in huge pages and dissolves huge pages into normal pages.
In case migration is canceled, huge pages are used again if memory conditions
permit it. It applies cleanly on top of the patch series posted Dec 15:
https://lists.cs.columbia.edu/pipermail/kvmarm/2014-December/012826.html

Patch #11 of the series has been dropped.

Signed-off-by: Mario Smarduch m.smard...@samsung.com
---

Change Log since last RESEND:
- fixed a bug exposed by __get_user_pages_fast(): when the region is writable,
  prevent write protection of the pte so we can handle a future write fault
  and mark the page dirty
- removed marking the entire huge page dirty on the initial dirty log read
- don't dissolve non-writable huge pages
- made updates based on Christoffer's comments
  - renamed the logging status function to memslot_is_logging()
  - changed a few values to bool
  - streamlined user_mem_abort() to eliminate extra conditional checks
 
---
 arch/arm/kvm/mmu.c |   92 +---
 1 file changed, 87 insertions(+), 5 deletions(-)

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 73d506f..2bfe22d 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -47,6 +47,18 @@ static phys_addr_t hyp_idmap_vector;
 #define kvm_pmd_huge(_x)   (pmd_huge(_x) || pmd_trans_huge(_x))
 #define kvm_pud_huge(_x)   pud_huge(_x)
 
+#define KVM_S2PTE_FLAG_IS_IOMAP	(1UL << 0)
+#define KVM_S2PTE_FLAG_LOGGING_ACTIVE	(1UL << 1)
+
+static bool memslot_is_logging(struct kvm_memory_slot *memslot)
+{
+#ifdef CONFIG_ARM
+   return !!memslot->dirty_bitmap;
+#else
+   return false;
+#endif
+}
+
 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 {
/*
@@ -59,6 +71,25 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, 
phys_addr_t ipa)
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
 }
 
+/**
+ * stage2_dissolve_pmd() - clear and flush huge PMD entry
+ * @kvm:   pointer to kvm structure.
+ * @addr:  IPA
+ * @pmd:   pmd pointer for IPA
+ *
+ * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
+ * pages in the range dirty.
+ */
+static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
+{
+   if (!kvm_pmd_huge(*pmd))
+   return;
+
+   pmd_clear(pmd);
+   kvm_tlb_flush_vmid_ipa(kvm, addr);
+   put_page(virt_to_page(pmd));
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
  int min, int max)
 {
@@ -703,10 +734,13 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct 
kvm_mmu_memory_cache
 }
 
 static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
- phys_addr_t addr, const pte_t *new_pte, bool iomap)
+ phys_addr_t addr, const pte_t *new_pte,
+ unsigned long flags)
 {
pmd_t *pmd;
pte_t *pte, old_pte;
+   bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
+   bool logging_active = flags & KVM_S2PTE_FLAG_LOGGING_ACTIVE;
 
/* Create stage-2 page table mapping - Levels 0 and 1 */
pmd = stage2_get_pmd(kvm, cache, addr);
@@ -718,6 +752,13 @@ static int stage2_set_pte(struct kvm *kvm, struct 
kvm_mmu_memory_cache *cache,
return 0;
}
 
+   /*
+* While dirty page logging - dissolve huge PMD, then continue on to
+* allocate page.
+*/
+   if (logging_active)
+   stage2_dissolve_pmd(kvm, addr, pmd);
+
/* Create stage-2 page mappings - Level 2 */
if (pmd_none(*pmd)) {
if (!cache)
@@ -774,7 +815,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t 
guest_ipa,
if (ret)
goto out;
	spin_lock(&kvm->mmu_lock);
-   ret = stage2_set_pte(kvm, cache, addr, &pte, true);
+   ret = stage2_set_pte(kvm, cache, addr, &pte,
+				KVM_S2PTE_FLAG_IS_IOMAP);
	spin_unlock(&kvm->mmu_lock);
if (ret)
goto out;
@@ -1002,6 +1044,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
pfn_t pfn;
pgprot_t mem_type = PAGE_S2;
bool fault_ipa_uncached;
+   bool can_set_pte_rw = true;
+   unsigned long set_pte_flags = 0;
 
write_fault = kvm_is_write_fault(vcpu);
	if (fault_status == FSC_PERM && !write_fault) {
@@ -1009,6 +1053,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
return -EFAULT;
}
 
+
/* Let's check if we will get back a huge page backed by hugetlbfs */
	down_read(&current->mm->mmap_sem);
	vma = find_vma_intersection(current->mm, hva, hva + 1);
@@ -1065,6 +1110,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t 

Re: [patch 2/3] KVM: x86: add option to advance tscdeadline hrtimer expiration

2015-01-08 Thread Paolo Bonzini


On 08/01/2015 18:41, Marcelo Tosatti wrote:
 Paolo?
 
  And cover letter is a bit misleading:  The condition does nothing to
  guarantee TSC based __delay() loop.  (Right now, __delay() = delay_tsc()
  whenever the hardware has TSC, regardless of stability, thus always.)
 
 OK.

Yes, because of this it is unnecessary to check for stable TSC.  I'll
remove that.

Paolo


Re: [RFC 2/2] x86, vdso, pvclock: Simplify and speed up the vdso pvclock reader

2015-01-08 Thread Marcelo Tosatti
On Tue, Jan 06, 2015 at 11:49:09AM -0800, Andy Lutomirski wrote:
 On Tue, Jan 6, 2015 at 10:45 AM, Marcelo Tosatti mtosa...@redhat.com wrote:
  On Tue, Jan 06, 2015 at 10:26:22AM -0800, Andy Lutomirski wrote:
  On Tue, Jan 6, 2015 at 10:13 AM, Marcelo Tosatti mtosa...@redhat.com 
  wrote:
   On Tue, Jan 06, 2015 at 08:56:40AM -0800, Andy Lutomirski wrote:
   On Jan 6, 2015 4:01 AM, Paolo Bonzini pbonz...@redhat.com wrote:
   
   
   
On 06/01/2015 09:42, Paolo Bonzini wrote:
   Still confused.  So we can freeze all vCPUs in the host, then 
   update
   pvti 1, then resume vCPU 1, then update pvti 0?  In that case, 
   we have
   a problem, because vCPU 1 can observe pvti 0 mid-update, and KVM
   doesn't increment the version pre-update, and we can return 
   completely
   bogus results.
  Yes.
 But then the getcpu test would fail (1->0).  Even if you have an ABA
 situation (1->0->1), it's okay because the pvti that is fetched is the
 one returned by the first getcpu.
   
... this case of partial update of pvti, which is caught by the 
version
field, if of course different from the other (extremely unlikely) that
Andy pointed out.  That is when the getcpus are done on the same vCPU,
but the rdtsc is another.
   
That one can be fixed by rdtscp, like
   
do {
// get a consistent (pvti, v, tsc) tuple
do {
cpu = get_cpu();
pvti = get_pvti(cpu);
                v = pvti->version & ~1;
// also acts as rmb();
rdtsc_barrier();
                tsc = rdtscp(&cpu1);
  
   Off-topic note: rdtscp doesn't need a barrier at all.  AIUI AMD
   specified it that way and both AMD and Intel implement it correctly.
   (rdtsc, on the other hand, definitely needs the barrier beforehand.)
  
// control dependency, no need for rdtsc_barrier?
} while(cpu != cpu1);
   
// ... compute nanoseconds from pvti and tsc ...
rmb();
        } while (v != pvti->version);
  
   Still no good.  We can migrate a bunch of times so we see the same CPU
   all three times and *still* don't get a consistent read, unless we
   play nasty games with lots of version checks (I have a patch for that,
   but I don't like it very much).  The patch is here:
  
   https://git.kernel.org/cgit/linux/kernel/git/luto/linux.git/commit/?h=x86/vdso_paranoiaid=a69754dc5ff33f5187162b5338854ad23dd7be8d
  
   but I don't like it.
  
   Thus far, I've been told unambiguously that a guest can't observe pvti
   while it's being written, and I think you're now telling me that this
   isn't true and that a guest *can* observe pvti while it's being
   written while the low bit of the version field is not set.  If so,
   this is rather strongly incompatible with the spec in the KVM docs.
  
   I don't suppose that you and Marcelo could agree on what the actual
   semantics that KVM provides are and could write it down in a way that
   people who haven't spent a long time staring at the request code
   understand?  And maybe you could even fix the implementation while
   you're at it if the implementation is, indeed, broken.  I have ugly
   patches to fix it here:
  
   https://git.kernel.org/cgit/linux/kernel/git/luto/linux.git/commit/?h=x86/vdso_paranoiaid=3b718a050cba52563d831febc2e1ca184c02bac0
  
   but I'm not thrilled with them.
  
   --Andy
  
   I suppose that separating the version write from the rest of the pvclock
   structure is sufficient, as that would guarantee the writes are not
   reordered even with fast string REP MOVS.
  
   Thanks for catching this Andy!
  
 
  Don't you still need:
 
  version++;
  write the rest;
  version++;
 
  with possible smp_wmb() in there to keep the compiler from messing around?
 
  Correct. Could just as well follow the protocol and use odd/even, which
  is what your patch does.
 
  What is the point with the new flags bit though?
 
 To try to work around the problem on old hosts.  I'm not at all
 convinced that this is worthwhile or that it helps, though.

Andy, 

Are you going to submit the fix or should i? 
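
For reference, a minimal standalone sketch of the odd/even version protocol
under discussion, with C11 atomics standing in for the kernel's
smp_wmb()/rmb(); the pvti structure here is a simplified stand-in for
pvclock_vcpu_time_info, not the real layout:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for the per-vCPU pvti structure. */
struct pvti {
	_Atomic uint32_t version;	/* odd while an update is in flight */
	uint64_t tsc_timestamp;
	uint64_t system_time;
};

static struct pvti pvti;

/* Host/writer side: bump version to odd, update fields, bump to even. */
static void writer_update(uint64_t tsc, uint64_t ns)
{
	uint32_t v = atomic_load_explicit(&pvti.version, memory_order_relaxed);

	atomic_store_explicit(&pvti.version, v + 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* like smp_wmb() */
	pvti.tsc_timestamp = tsc;
	pvti.system_time = ns;
	atomic_thread_fence(memory_order_release);	/* like smp_wmb() */
	atomic_store_explicit(&pvti.version, v + 2, memory_order_relaxed);
}

/* Guest/reader side: retry until a stable, even version is observed. */
static uint64_t reader_read(void)
{
	uint32_t v;
	uint64_t ns;

	do {
		v = atomic_load_explicit(&pvti.version,
					 memory_order_acquire) & ~1u;
		ns = pvti.system_time;	/* + scaled tsc delta in real code */
		atomic_thread_fence(memory_order_acquire);	/* like rmb() */
	} while (v != atomic_load_explicit(&pvti.version,
					   memory_order_relaxed));

	return ns;
}

int main(void)
{
	writer_update(1000, 42);
	printf("system_time: %llu\n", (unsigned long long)reader_read());
	return 0;
}

The point being debated above is whether the host actually keeps the version
odd for the whole duration of the update; the reader loop only works if it
does.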



Re: [RFC 2/2] x86, vdso, pvclock: Simplify and speed up the vdso pvclock reader

2015-01-08 Thread Andy Lutomirski
On Thu, Jan 8, 2015 at 2:31 PM, Marcelo Tosatti mtosa...@redhat.com wrote:
 On Tue, Jan 06, 2015 at 11:49:09AM -0800, Andy Lutomirski wrote:
 On Tue, Jan 6, 2015 at 10:45 AM, Marcelo Tosatti mtosa...@redhat.com wrote:
  On Tue, Jan 06, 2015 at 10:26:22AM -0800, Andy Lutomirski wrote:
  On Tue, Jan 6, 2015 at 10:13 AM, Marcelo Tosatti mtosa...@redhat.com 
  wrote:
   On Tue, Jan 06, 2015 at 08:56:40AM -0800, Andy Lutomirski wrote:
   On Jan 6, 2015 4:01 AM, Paolo Bonzini pbonz...@redhat.com wrote:
   
   
   
On 06/01/2015 09:42, Paolo Bonzini wrote:
   Still confused.  So we can freeze all vCPUs in the host, then 
   update
   pvti 1, then resume vCPU 1, then update pvti 0?  In that case, 
   we have
   a problem, because vCPU 1 can observe pvti 0 mid-update, and 
   KVM
   doesn't increment the version pre-update, and we can return 
   completely
   bogus results.
  Yes.
  But then the getcpu test would fail (1->0).  Even if you have an ABA
  situation (1->0->1), it's okay because the pvti that is fetched is the
  one returned by the first getcpu.
   
... this case of partial update of pvti, which is caught by the 
version
field, if of course different from the other (extremely unlikely) 
that
Andy pointed out.  That is when the getcpus are done on the same 
vCPU,
but the rdtsc is another.
   
That one can be fixed by rdtscp, like
   
do {
// get a consistent (pvti, v, tsc) tuple
do {
cpu = get_cpu();
pvti = get_pvti(cpu);
                v = pvti->version & ~1;
// also acts as rmb();
rdtsc_barrier();
                tsc = rdtscp(&cpu1);
  
   Off-topic note: rdtscp doesn't need a barrier at all.  AIUI AMD
   specified it that way and both AMD and Intel implement it correctly.
   (rdtsc, on the other hand, definitely needs the barrier beforehand.)
  
// control dependency, no need for rdtsc_barrier?
} while(cpu != cpu1);
   
// ... compute nanoseconds from pvti and tsc ...
rmb();
        } while (v != pvti->version);
  
   Still no good.  We can migrate a bunch of times so we see the same CPU
   all three times and *still* don't get a consistent read, unless we
   play nasty games with lots of version checks (I have a patch for that,
   but I don't like it very much).  The patch is here:
  
   https://git.kernel.org/cgit/linux/kernel/git/luto/linux.git/commit/?h=x86/vdso_paranoiaid=a69754dc5ff33f5187162b5338854ad23dd7be8d
  
   but I don't like it.
  
   Thus far, I've been told unambiguously that a guest can't observe pvti
   while it's being written, and I think you're now telling me that this
   isn't true and that a guest *can* observe pvti while it's being
   written while the low bit of the version field is not set.  If so,
   this is rather strongly incompatible with the spec in the KVM docs.
  
   I don't suppose that you and Marcelo could agree on what the actual
   semantics that KVM provides are and could write it down in a way that
   people who haven't spent a long time staring at the request code
   understand?  And maybe you could even fix the implementation while
   you're at it if the implementation is, indeed, broken.  I have ugly
   patches to fix it here:
  
   https://git.kernel.org/cgit/linux/kernel/git/luto/linux.git/commit/?h=x86/vdso_paranoiaid=3b718a050cba52563d831febc2e1ca184c02bac0
  
   but I'm not thrilled with them.
  
   --Andy
  
   I suppose that separating the version write from the rest of the pvclock
   structure is sufficient, as that would guarantee the writes are not
   reordered even with fast string REP MOVS.
  
   Thanks for catching this Andy!
  
 
  Don't you still need:
 
  version++;
  write the rest;
  version++;
 
  with possible smp_wmb() in there to keep the compiler from messing around?
 
  Correct. Could just as well follow the protocol and use odd/even, which
  is what your patch does.
 
  What is the point with the new flags bit though?

 To try to work around the problem on old hosts.  I'm not at all
 convinced that this is worthwhile or that it helps, though.

 Andy,

 Are you going to submit the fix or should i?


I'd prefer if you did it.  I'm not familiar enough with the KVM memory
management stuff to do it confidently.  Feel free to mooch from my
patch if it's helpful.

--Andy

-- 
Andy Lutomirski
AMA Capital Management, LLC


Re: [PATCH v15 11/11] KVM: arm/arm64: Add support to dissolve huge PUD

2015-01-08 Thread Christoffer Dall
On Wed, Jan 07, 2015 at 07:01:10PM -0800, Mario Smarduch wrote:
 On 01/07/2015 05:05 AM, Christoffer Dall wrote:
  On Sun, Dec 14, 2014 at 11:28:08PM -0800, Mario Smarduch wrote:
  This patch adds the same support for PUD huge page as for PMD. Huge PUD is 
  write protected for initial memory region write protection. Code to 
  dissolve 
  huge PUD is supported in user_mem_abort(). At this time this code has not 
  been 
  tested, but similar approach to current ARMv8 page logging test is in work,
  limiting kernel memory and mapping in 1 or 2GB into Guest address space on 
  a 
  4k page/48 bit host, some host kernel test code needs to be added to detect
  page fault to this region and side step general processing. Also similar 
  to 
  PMD case all pages in range are marked dirty when PUD entry is cleared.
  
  the note about this code being untested shouldn't be part of the commit
  message but after the '---' separater or in the cover letter I think.
 
 Ah ok.
  
 
  Signed-off-by: Mario Smarduch m.smard...@samsung.com
  ---
   arch/arm/include/asm/kvm_mmu.h |  8 +
   arch/arm/kvm/mmu.c | 64 
  --
   arch/arm64/include/asm/kvm_mmu.h   |  9 +
   arch/arm64/include/asm/pgtable-hwdef.h |  3 ++
   4 files changed, 81 insertions(+), 3 deletions(-)
 
  diff --git a/arch/arm/include/asm/kvm_mmu.h 
  b/arch/arm/include/asm/kvm_mmu.h
  index dda0046..703d04d 100644
  --- a/arch/arm/include/asm/kvm_mmu.h
  +++ b/arch/arm/include/asm/kvm_mmu.h
  @@ -133,6 +133,14 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
 	return (pmd_val(*pmd) & L_PMD_S2_RDWR) == L_PMD_S2_RDONLY;
   }
   
  +static inline void kvm_set_s2pud_readonly(pud_t *pud)
  +{
  +}
  +
  +static inline bool kvm_s2pud_readonly(pud_t *pud)
  +{
  +  return false;
  +}
   
   /* Open coded p*d_addr_end that can deal with 64bit addresses */
   #define kvm_pgd_addr_end(addr, end)   
  \
  diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
  index 59003df..35840fb 100644
  --- a/arch/arm/kvm/mmu.c
  +++ b/arch/arm/kvm/mmu.c
  @@ -109,6 +109,55 @@ void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t 
  addr, pmd_t *pmd)
 }
   }
   
  +/**
  +  * stage2_find_pud() - find a PUD entry
  +  * @kvm: pointer to kvm structure.
  +  * @addr:IPA address
  +  *
  +  * Return address of PUD entry or NULL if not allocated.
  +  */
  +static pud_t *stage2_find_pud(struct kvm *kvm, phys_addr_t addr)
  
  why can't you reuse stage2_get_pud here?
 
 stage2_get_* allocate intermediate tables, when they're called
 you know intermediate tables are needed to install a pmd or pte.
 But currently there is no way to tell we faulted in a PUD
 region, this code just checks if a PUD is set, and not
 allocate intermediate tables along the way.

hmmm, but if we get here it means that we are faulting on an address, so
we need to map something at that address regardless, so I don't see the
problem in using stage2_get_pud.

 
 Overall not sure if this is in preparation for a new huge page (PUD sized)?
 Besides developing a custom test, not sure how to use this
 and determine we fault in a PUD region? Generic 'gup'
 code does handle PUDs but perhaps some arch. has PUD sized
 huge pages.
 

When Marc and I discussed this we came to the conclusion that we wanted
code to support this code path for when huge PUDs were suddenly used,
but now that I see the code, I am realizing that adding huge PUD support
at the Stage-2 level requires a lot of changes to this file, so I really
don't think we need to handle it at this point after all.

  
  +{
  +  pgd_t *pgd;
  +
   +  pgd = kvm->arch.pgd + pgd_index(addr);
  +  if (pgd_none(*pgd))
  +  return NULL;
  +
  +  return pud_offset(pgd, addr);
  +}
  +
  +/**
  + * stage2_dissolve_pud() - clear and flush huge PUD entry
  + * @kvm:  pointer to kvm structure.
  + * @addr  IPA
  + *
  + * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. 
  Marks all
  + * pages in the range dirty.
  + */
  +void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr)
  +{
  +  pud_t *pud;
  +  gfn_t gfn;
  +  long i;
  +
  +  pud = stage2_find_pud(kvm, addr);
   +  if (pud && !pud_none(*pud) && kvm_pud_huge(*pud)) {
  
  I'm just thinking here, why do we need to check if we get a valid pud
  back here, but we don't need the equivalent check in dissolve_pmd from
  patch 7?
 
 kvm_pud_huge() doesn't check bit 0 for invalid entry, but
 pud_none() is not the right way to check either, maybe pud_bad()
 first. Nothing is done in patch 7 since the pmd is retrieved from
 stage2_get_pmd().
 

hmmm, but stage2_get_pmd() can return a NULL pointer if you have the
IOMAP flag set...

  
  I think the rationale is that it should never happen because we never
  call these functions with the logging and iomap flags at the same
  time...
 
 I'm little lost here, not sure how it's related to above.
 But I think a VFIO device will have a 

Re: [PATCHv2] arch:x86:kvm:Add function for setting pending timer on virtual cpu in x86.c

2015-01-08 Thread Paolo Bonzini


On 02/01/2015 04:05, Nicholas Krause wrote:
 Adds the function kvm_vcpu_set_pending_timer for setting a pending timer on a
 virtual cpu for x86 systems.  This function calls kvm_make_request internally,
 as moved over from lapic.c, with the arguments of the virtual cpu passed to
 the function kvm_vcpu_set_pending_timer and the flag KVM_REQ_PENDING_TIMER for
 requesting a pending timer from kvm. In addition we added a function prototype
 definition to allow this function to be used by any file that includes the
 header file x86.h and needs to set the pending timer on a virtual cpu
 supported by kvm.
 Signed-off-by: Nicholas Krause xerofo...@gmail.com
 ---
  arch/x86/kvm/lapic.c | 3 +--
  arch/x86/kvm/x86.c   | 3 +++
  arch/x86/kvm/x86.h   | 1 +
  3 files changed, 5 insertions(+), 2 deletions(-)
 
 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
 index 4f0c0b9..6d69b49 100644
 --- a/arch/x86/kvm/lapic.c
 +++ b/arch/x86/kvm/lapic.c
 @@ -1082,8 +1082,7 @@ static void apic_timer_expired(struct kvm_lapic *apic)
   return;
  
 	atomic_inc(&apic->lapic_timer.pending);
 - /* FIXME: this code should not know anything about vcpus */
 - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
 + kvm_set_pending_timer(vcpu);
  
   if (waitqueue_active(q))
   wake_up_interruptible(q);
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index c259814..7f8d048 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -1083,6 +1083,9 @@ static void update_pvclock_gtod(struct timekeeper *tk)

The patch didn't apply as is because the ,9 should have been ,10.

Fixed up and applied.

Paolo

  }
  #endif
  
 +void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
 +{
 + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
 +}
  
  static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
  {
 diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
 index cc1d61a..4b7a819 100644
 --- a/arch/x86/kvm/x86.h
 +++ b/arch/x86/kvm/x86.h
 @@ -147,6 +147,7 @@ static inline void kvm_register_writel(struct kvm_vcpu 
 *vcpu,
  
  void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
  void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
 +void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
  int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int 
 inc_eip);
  
  void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
 



[PATCH 0/4] arm/arm64: KVM: Random selection of MM related fixes

2015-01-08 Thread Marc Zyngier
This small series fixes a number of issues that Christoffer and I have
been trying to nail down for a while, having to do with the host dying
under load (swapping), and also with the way we deal with caches in
general (and with set/way operation in particular):

- The first patch, courtesy of Steve Capper, fixes a long standing
  buglet in one of the MMU notifiers.

- The second one changes the way we handle cache ops by set/way,
  basically turning them into VA ops for the whole memory. This allows
  platforms with system caches to boot a 32bit zImage, for example.

- The third one fixes a corner case that could happen if the guest
  used an uncached mapping (or had its caches off) while the host was
  swapping it out (and using a cache-coherent IO subsystem).

- Finally, the last one fixes this stability issue when the host was
  swapping, by using a kernel mapping for cache maintenance instead of
  the userspace one.

With these patches (and both the TLB invalidation fix I posted before
Xmas and the HCR fix posted yesterday), the APM platform seems much
more robust than it was. Fingers crossed.

Based on 3.19-rc3, tested on Juno, X-Gene and Cubietruck.

Also at git://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git 
kvm-arm64/mm-fixes-3.19

Marc Zyngier (3):
  arm/arm64: KVM: Use set/way op trapping to track the state of the
caches
  arm/arm64: KVM: Flush caches to memory on unmap
  arm/arm64: KVM: use kernel mapping to perform invalidation on page
fault

Steve Capper (1):
  mm: Correct ordering of *_clear_flush_young_notify

 arch/arm/include/asm/kvm_host.h   |   3 --
 arch/arm/include/asm/kvm_mmu.h|  68 +
 arch/arm/kvm/arm.c|  10 
 arch/arm/kvm/coproc.c |  90 +
 arch/arm/kvm/mmu.c|  58 +++---
 arch/arm64/include/asm/kvm_host.h |   3 --
 arch/arm64/include/asm/kvm_mmu.h  |  31 ++--
 arch/arm64/kvm/sys_regs.c | 102 ++
 include/linux/mmu_notifier.h  |   8 +--
 9 files changed, 244 insertions(+), 129 deletions(-)

-- 
2.1.4



[PATCH 3/4] arm/arm64: KVM: Flush caches to memory on unmap

2015-01-08 Thread Marc Zyngier
Let's assume a guest has created an uncached mapping, and written
to that page. Let's also assume that the host uses a cache-coherent
IO subsystem. Let's finally assume that the host is under memory
pressure and starts to swap things out.

Before this uncached page is evicted, we need to make sure it
gets invalidated, or the IO subsystem is going to swap out the
cached view, losing the data that has been written there.

Signed-off-by: Marc Zyngier marc.zyng...@arm.com
---
 arch/arm/include/asm/kvm_mmu.h   | 31 +++
 arch/arm/kvm/mmu.c   | 46 +++-
 arch/arm64/include/asm/kvm_mmu.h | 18 
 3 files changed, 80 insertions(+), 15 deletions(-)

diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 63e0ecc..7ceb836 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -44,6 +44,7 @@
 
 #ifndef __ASSEMBLY__
 
+#include linux/highmem.h
 #include asm/cacheflush.h
 #include asm/pgalloc.h
 
@@ -190,6 +191,36 @@ static inline void coherent_cache_guest_page(struct 
kvm_vcpu *vcpu, hva_t hva,
 
 #define kvm_virt_to_phys(x)virt_to_idmap((unsigned long)(x))
 
+static inline void __kvm_flush_dcache_pte(pte_t pte)
+{
+   void *va = kmap_atomic(pte_page(pte));
+
+   kvm_flush_dcache_to_poc(va, PAGE_SIZE);
+
+   kunmap_atomic(va);
+}
+
+static inline void __kvm_flush_dcache_pmd(pmd_t pmd)
+{
+   unsigned long size = PMD_SIZE;
+   pfn_t pfn = pmd_pfn(pmd);
+
+   while (size) {
+   void *va = kmap_atomic_pfn(pfn);
+
+   kvm_flush_dcache_to_poc(va, PAGE_SIZE);
+
+   pfn++;
+   size -= PAGE_SIZE;
+
+   kunmap_atomic(va);
+   }
+}
+
+static inline void __kvm_flush_dcache_pud(pud_t pud)
+{
+}
+
 void stage2_flush_vm(struct kvm *kvm);
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 1dc9778..1f5b793 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -58,6 +58,21 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, 
phys_addr_t ipa)
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
 }
 
+static void kvm_flush_dcache_pte(pte_t pte)
+{
+   __kvm_flush_dcache_pte(pte);
+}
+
+static void kvm_flush_dcache_pmd(pmd_t pmd)
+{
+   __kvm_flush_dcache_pmd(pmd);
+}
+
+static void kvm_flush_dcache_pud(pud_t pud)
+{
+   __kvm_flush_dcache_pud(pud);
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
  int min, int max)
 {
@@ -128,9 +143,12 @@ static void unmap_ptes(struct kvm *kvm, pmd_t *pmd,
start_pte = pte = pte_offset_kernel(pmd, addr);
do {
if (!pte_none(*pte)) {
+   pte_t old_pte = *pte;
kvm_set_pte(pte, __pte(0));
-   put_page(virt_to_page(pte));
kvm_tlb_flush_vmid_ipa(kvm, addr);
+   if ((pte_val(old_pte) & PAGE_S2_DEVICE) != PAGE_S2_DEVICE)
+   kvm_flush_dcache_pte(old_pte);
+   put_page(virt_to_page(pte));
}
} while (pte++, addr += PAGE_SIZE, addr != end);
 
@@ -149,8 +167,10 @@ static void unmap_pmds(struct kvm *kvm, pud_t *pud,
next = kvm_pmd_addr_end(addr, end);
if (!pmd_none(*pmd)) {
if (kvm_pmd_huge(*pmd)) {
+   pmd_t old_pmd = *pmd;
pmd_clear(pmd);
kvm_tlb_flush_vmid_ipa(kvm, addr);
+   kvm_flush_dcache_pmd(old_pmd);
put_page(virt_to_page(pmd));
} else {
unmap_ptes(kvm, pmd, addr, next);
@@ -173,8 +193,10 @@ static void unmap_puds(struct kvm *kvm, pgd_t *pgd,
next = kvm_pud_addr_end(addr, end);
if (!pud_none(*pud)) {
if (pud_huge(*pud)) {
+   pud_t old_pud = *pud;
pud_clear(pud);
kvm_tlb_flush_vmid_ipa(kvm, addr);
+   kvm_flush_dcache_pud(old_pud);
put_page(virt_to_page(pud));
} else {
unmap_pmds(kvm, pud, addr, next);
@@ -209,10 +231,8 @@ static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
 
pte = pte_offset_kernel(pmd, addr);
do {
-   if (!pte_none(*pte)) {
-   hva_t hva = gfn_to_hva(kvm, addr >> PAGE_SHIFT);
-   kvm_flush_dcache_to_poc((void*)hva, PAGE_SIZE);
-   }
+   if (!pte_none(*pte))
+   kvm_flush_dcache_pte(*pte);
} while (pte++, addr += PAGE_SIZE, addr != end);
 }
 
@@ -226,12 +246,10 @@ static void 

[PATCH 4/4] arm/arm64: KVM: use kernel mapping to perform invalidation on page fault

2015-01-08 Thread Marc Zyngier
When handling a fault in stage-2, we need to resync I$ and D$, just
to be sure we don't leave any old cache line behind.

That's very good, except that we do so using the *user* address.
Under heavy load (swapping like crazy), we may end up in a situation
where the page gets mapped in stage-2 while being unmapped from
userspace by another CPU.

At that point, the DC/IC instructions can generate a fault, which
we handle with kvm-mmu_lock held. The box quickly deadlocks, user
is unhappy.

Instead, perform this invalidation through the kernel mapping,
which is guaranteed to be present. The box is much happier, and so
am I.

Signed-off-by: Marc Zyngier marc.zyng...@arm.com
---
 arch/arm/include/asm/kvm_mmu.h   | 37 -
 arch/arm/kvm/mmu.c   | 12 
 arch/arm64/include/asm/kvm_mmu.h | 13 -
 3 files changed, 44 insertions(+), 18 deletions(-)

diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 7ceb836..be6bc72 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -162,13 +162,10 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu 
*vcpu)
	return (vcpu->arch.cp15[c1_SCTLR] & 0b101) == 0b101;
 }
 
-static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
-unsigned long size,
-bool ipa_uncached)
+static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t 
pfn,
+  unsigned long size,
+  bool ipa_uncached)
 {
-   if (!vcpu_has_cache_enabled(vcpu) || ipa_uncached)
-   kvm_flush_dcache_to_poc((void *)hva, size);
-   
/*
 * If we are going to insert an instruction page and the icache is
 * either VIPT or PIPT, there is a potential problem where the host
@@ -180,10 +177,32 @@ static inline void coherent_cache_guest_page(struct 
kvm_vcpu *vcpu, hva_t hva,
 *
 * VIVT caches are tagged using both the ASID and the VMID and doesn't
 * need any kind of flushing (DDI 0406C.b - Page B3-1392).
+*
+* We need to do this through a kernel mapping (using the
+* user-space mapping has proved to be the wrong
+* solution). For that, we need to kmap one page at a time,
+* and iterate over the range.
 */
-   if (icache_is_pipt()) {
-   __cpuc_coherent_user_range(hva, hva + size);
-   } else if (!icache_is_vivt_asid_tagged()) {
+
+   VM_BUG_ON(size & PAGE_MASK);
+
+   while (size) {
+   void *va = kmap_atomic_pfn(pfn);
+
+   if (!vcpu_has_cache_enabled(vcpu) || ipa_uncached)
+   kvm_flush_dcache_to_poc(va, PAGE_SIZE);
+   
+   if (icache_is_pipt())
+   __cpuc_coherent_user_range((unsigned long)va,
+  (unsigned long)va + 
PAGE_SIZE);
+
+   size -= PAGE_SIZE;
+   pfn++;
+
+   kunmap_atomic(va);
+   }
+
+   if (!icache_is_pipt() && !icache_is_vivt_asid_tagged()) {
/* any kind of VIPT cache */
__flush_icache_all();
}
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 1f5b793..4da6504 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -921,6 +921,12 @@ static bool kvm_is_device_pfn(unsigned long pfn)
return !pfn_valid(pfn);
 }
 
+static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn,
+ unsigned long size, bool uncached)
+{
+   __coherent_cache_guest_page(vcpu, pfn, size, uncached);
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
  struct kvm_memory_slot *memslot, unsigned long hva,
  unsigned long fault_status)
@@ -1010,8 +1016,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
kvm_set_s2pmd_writable(new_pmd);
kvm_set_pfn_dirty(pfn);
}
-   coherent_cache_guest_page(vcpu, hva & PMD_MASK, PMD_SIZE,
- fault_ipa_uncached);
+   coherent_cache_guest_page(vcpu, pfn, PMD_SIZE, 
fault_ipa_uncached);
ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, new_pmd);
} else {
pte_t new_pte = pfn_pte(pfn, mem_type);
@@ -1019,8 +1024,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
kvm_set_s2pte_writable(new_pte);
kvm_set_pfn_dirty(pfn);
}
-   coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
- fault_ipa_uncached);
+   coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE, 

Re: [PATCHv2] arch:x86:kvm:Add function for setting pending timer on virtual cpu in x86.c

2015-01-08 Thread Andrej Manduch
Hi

On 01/08/2015 06:57 AM, Paolo Bonzini wrote:
 
 
 On 02/01/2015 04:05, Nicholas Krause wrote:
 Adds the function kvm_vcpu_set_pending_timer for setting a pending timer on 
 a virtual cpu for x86 systems.  This
 function calls kvm_make_request internally as moved over from lapic.c with 
 the arugments of the virtual cpu passed
 to the function kvm_vcpu_set_pending_timer and the flag of 
 KVM_REQ_PENDING_TIMER for requesting a pending timer
 to kvm. In additon we added a function prototype definition as to allow this 
 function to be used by any files that
 include the header file,x86.h and need to set the pending timer on a virtual 
 cpu supported by kvm.
 Signed-off-by: Nicholas Krause xerofo...@gmail.com
 ---
  arch/x86/kvm/lapic.c | 3 +--
  arch/x86/kvm/x86.c   | 3 +++
  arch/x86/kvm/x86.h   | 1 +
  3 files changed, 5 insertions(+), 2 deletions(-)

 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
 index 4f0c0b9..6d69b49 100644
 --- a/arch/x86/kvm/lapic.c
 +++ b/arch/x86/kvm/lapic.c
 @@ -1082,8 +1082,7 @@ static void apic_timer_expired(struct kvm_lapic *apic)
  return;
  
  atomic_inc(apic-lapic_timer.pending);
 -/* FIXME: this code should not know anything about vcpus */
I don't want to sound like I'm nitpicking. But I need to ask. Why is
this comment removed?

 -kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
 +kvm_set_pending_timer(vcpu);
  
  if (waitqueue_active(q))
  wake_up_interruptible(q);
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index c259814..7f8d048 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -1083,6 +1083,9 @@ static void update_pvclock_gtod(struct timekeeper *tk)
 
 The patch didn't apply as is because the ,9 should have been ,10.
 
 Fixed up and applied.
 
 Paolo
 
  }
  #endif
  
 +void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
 +{
 +kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
 +}
  
  static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
  {
 diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
 index cc1d61a..4b7a819 100644
 --- a/arch/x86/kvm/x86.h
 +++ b/arch/x86/kvm/x86.h
 @@ -147,6 +147,7 @@ static inline void kvm_register_writel(struct kvm_vcpu 
 *vcpu,
  
  void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
  void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
 +void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
  int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int 
 inc_eip);
  
  void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);

 
 

-- 
Kind regards,
b.


[PATCH 1/4] mm: Correct ordering of *_clear_flush_young_notify

2015-01-08 Thread Marc Zyngier
From: Steve Capper steve.cap...@linaro.org

ptep_clear_flush_young_notify and pmdp_clear_flush_young_notify both
call the notifiers *after* the pte/pmd has been made old.

This can cause problems with KVM that relies on being able to block
MMU notifiers when carrying out maintenance of second stage
descriptors.

This patch ensures that the MMU notifiers are called before ptes and
pmds are made old.

Signed-off-by: Steve Capper steve.cap...@linaro.org
---
 include/linux/mmu_notifier.h | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 95243d2..c454c76 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -290,11 +290,11 @@ static inline void mmu_notifier_mm_destroy(struct 
mm_struct *mm)
int __young;\
struct vm_area_struct *___vma = __vma;  \
unsigned long ___address = __address;   \
-   __young = ptep_clear_flush_young(___vma, ___address, __ptep);   \
-   __young |= mmu_notifier_clear_flush_young(___vma->vm_mm,\
+   __young = mmu_notifier_clear_flush_young(___vma->vm_mm, \
  ___address,   \
  ___address +  \
PAGE_SIZE); \
+   __young |= ptep_clear_flush_young(___vma, ___address, __ptep);  \
__young;\
 })
 
@@ -303,11 +303,11 @@ static inline void mmu_notifier_mm_destroy(struct 
mm_struct *mm)
int __young;\
struct vm_area_struct *___vma = __vma;  \
unsigned long ___address = __address;   \
-   __young = pmdp_clear_flush_young(___vma, ___address, __pmdp);   \
-   __young |= mmu_notifier_clear_flush_young(___vma->vm_mm,\
+   __young = mmu_notifier_clear_flush_young(___vma->vm_mm, \
  ___address,   \
  ___address +  \
PMD_SIZE);  \
+   __young |= pmdp_clear_flush_young(___vma, ___address, __pmdp);  \
__young;\
 })
 
-- 
2.1.4



[PATCH 2/4] arm/arm64: KVM: Use set/way op trapping to track the state of the caches

2015-01-08 Thread Marc Zyngier
Trying to emulate the behaviour of set/way cache ops is fairly
pointless, as there are too many ways we can end up missing stuff.
Also, there are some system caches out there that simply ignore
set/way operations.

So instead of trying to implement them, let's convert it to VA ops,
and use them as a way to re-enable the trapping of VM ops. That way,
we can detect the point when the MMU/caches are turned off, and do
a full VM flush (which is what the guest was trying to do anyway).

This allows a 32bit zImage to boot on the APM thingy, and will
probably help bootloaders in general.

Signed-off-by: Marc Zyngier marc.zyng...@arm.com
---
 arch/arm/include/asm/kvm_host.h   |   3 --
 arch/arm/kvm/arm.c|  10 
 arch/arm/kvm/coproc.c |  90 +
 arch/arm64/include/asm/kvm_host.h |   3 --
 arch/arm64/kvm/sys_regs.c | 102 ++
 5 files changed, 116 insertions(+), 92 deletions(-)

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 254e065..04b4ea0 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -125,9 +125,6 @@ struct kvm_vcpu_arch {
 * Anything that is not used directly from assembly code goes
 * here.
 */
-   /* dcache set/way operation pending */
-   int last_pcpu;
-   cpumask_t require_dcache_flush;
 
/* Don't run the guest on this vcpu */
bool pause;
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 2d6d910..0b0d58a 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -281,15 +281,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
	vcpu->cpu = cpu;
	vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state);
 
-   /*
-* Check whether this vcpu requires the cache to be flushed on
-* this physical CPU. This is a consequence of doing dcache
-* operations by set/way on this vcpu. We do it here to be in
-* a non-preemptible section.
-*/
-   if (cpumask_test_and_clear_cpu(cpu, &vcpu->arch.require_dcache_flush))
-   flush_cache_all(); /* We'd really want v7_flush_dcache_all() */
-
kvm_arm_set_running_vcpu(vcpu);
 }
 
@@ -541,7 +532,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct 
kvm_run *run)
ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
 
	vcpu->mode = OUTSIDE_GUEST_MODE;
-   vcpu->arch.last_pcpu = smp_processor_id();
kvm_guest_exit();
trace_kvm_exit(*vcpu_pc(vcpu));
/*
diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c
index 7928dbd..3d352a5 100644
--- a/arch/arm/kvm/coproc.c
+++ b/arch/arm/kvm/coproc.c
@@ -189,43 +189,57 @@ static bool access_l2ectlr(struct kvm_vcpu *vcpu,
return true;
 }
 
-/* See note at ARM ARM B1.14.4 */
+/*
+ * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
+ *
+ * Main problems:
+ * - S/W ops are local to a CPU (not broadcast)
+ * - We have line migration behind our back (speculation)
+ * - System caches don't support S/W at all (damn!)
+ *
+ * In the face of the above, the best we can do is to try and convert
+ * S/W ops to VA ops. Because the guest is not allowed to infer the
+ * S/W to PA mapping, it can only use S/W to nuke the whole cache,
+ * which is a rather good thing for us.
+ *
+ * Also, it is only used when turning caches on/off (The expected
+ * usage of the cache maintenance instructions that operate by set/way
+ * is associated with the cache maintenance instructions associated
+ * with the powerdown and powerup of caches, if this is required by
+ * the implementation.).
+ *
+ * We use the following policy:
+ *
+ * - If we trap a S/W operation, we enable VM trapping to detect
+ *   caches being turned on/off.
+ *
+ * - If the caches have already been turned off when doing the S/W op,
+ *   we nuke the whole VM cache.
+ *
+ * - We flush the cache on both caches being turned on and off.
+ *
+ * - Once the caches are enabled, we stop trapping VM ops.
+ */
 static bool access_dcsw(struct kvm_vcpu *vcpu,
const struct coproc_params *p,
const struct coproc_reg *r)
 {
-   unsigned long val;
-   int cpu;
-
	if (!p->is_write)
return read_from_write_only(vcpu, p);
 
-   cpu = get_cpu();
-
-   cpumask_setall(&vcpu->arch.require_dcache_flush);
-   cpumask_clear_cpu(cpu, &vcpu->arch.require_dcache_flush);
-
-   /* If we were already preempted, take the long way around */
-   if (cpu != vcpu->arch.last_pcpu) {
-   flush_cache_all();
-   goto done;
-   }
-
-   val = *vcpu_reg(vcpu, p->Rt1);
-
-   switch (p->CRm) {
-   case 6: /* Upgrade DCISW to DCCISW, as per HCR.SWIO */
-   case 14:/* DCCISW */
-   asm volatile(mcr p15, 0, %0, c7, c14, 2 : : r 

Re: [PATCH 2/3] x86_64,entry: Use sysret to return to userspace when possible

2015-01-08 Thread Borislav Petkov
On Fri, Nov 07, 2014 at 03:58:18PM -0800, Andy Lutomirski wrote:
 diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
 index 3710b8241945..a5afdf0f7fa4 100644
 --- a/arch/x86/kernel/entry_64.S
 +++ b/arch/x86/kernel/entry_64.S
 @@ -804,6 +804,54 @@ retint_swapgs:   /* return to user-space */

Ok, so retint_swapgs is also on the error_exit path.

What you're basically proposing is to use SYSRET on exception exits
too, AFAICT. And while I don't see anything wrong with the patch, you
probably need to run this by more people like tip guys + Linus just in
case. We can't allow ourselves to leak stuff here.

*/
   DISABLE_INTERRUPTS(CLBR_ANY)
   TRACE_IRQS_IRETQ
 +
 + /*
 +  * Try to use SYSRET instead of IRET if we're returning to
 +  * a completely clean 64-bit userspace context.
 +  */
 + movq (RCX-R11)(%rsp), %rcx
 + cmpq %rcx,(RIP-R11)(%rsp)   /* RCX == RIP */
 + jne opportunistic_sysret_failed
 +
 + /*
 +  * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
 +  * in kernel space.  This essentially lets the user take over
 +  * the kernel, since userspace controls RSP.  It's not worth
 +  * testing for canonicalness exactly -- this check detects any
 +  * of the 17 high bits set, which is true for non-canonical
 +  * or kernel addresses.  (This will pessimize vsyscall=native.
 +  * Big deal.)
 +  */
 + shr $47, %rcx

shr $__VIRTUAL_MASK_SHIFT, %rcx

I guess, in case someone decides to play with the address space again
and forgets this naked bit here.
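
Purely as an illustration of the check being discussed, the same test in C for
a 48-bit virtual address space; the hard-coded 47 mirrors the `shr $47` above
and would track __VIRTUAL_MASK_SHIFT in the real code:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* 48-bit virtual addresses: user-canonical means bits 63..47 are all clear. */
#define VIRTUAL_MASK_SHIFT 47

static bool sysret_rip_is_safe(uint64_t rip)
{
	/* Any of the 17 high bits set => non-canonical or kernel address. */
	return (rip >> VIRTUAL_MASK_SHIFT) == 0;
}

int main(void)
{
	printf("%d\n", sysret_rip_is_safe(0x00007fffffffe000ULL)); /* 1: user */
	printf("%d\n", sysret_rip_is_safe(0xffff880000000000ULL)); /* 0: kernel */
	printf("%d\n", sysret_rip_is_safe(0x0000800000000000ULL)); /* 0: non-canonical */
	return 0;
}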

 + jnz opportunistic_sysret_failed
 +
 + cmpq $__USER_CS,(CS-R11)(%rsp)  /* CS must match SYSRET */
 + jne opportunistic_sysret_failed
 +
 + movq (R11-R11)(%rsp), %r11
 + cmpq %r11,(EFLAGS-R11)(%rsp)/* R11 == RFLAGS */
 + jne opportunistic_sysret_failed
 +
 + testq $X86_EFLAGS_RF,%r11   /* sysret can't restore RF */
 + jnz opportunistic_sysret_failed
 +
 + /* nothing to check for RSP */
 +
 + cmpq $__USER_DS,(SS-R11)(%rsp)  /* SS must match SYSRET */
 + jne opportunistic_sysret_failed
 +
 + /*
 +  * We win!  This label is here just for ease of understanding
 +  * perf profiles.  Nothing jumps here.
 +  */
 +irq_return_via_sysret:
 + CFI_REMEMBER_STATE
 + RESTORE_ARGS 1,8,1
 + movq (RSP-RIP)(%rsp),%rsp
 + USERGS_SYSRET64
 + CFI_RESTORE_STATE
 +
 +opportunistic_sysret_failed:
   SWAPGS
   jmp restore_args

Ok, dammit, it happened again:

...
[   13.480778] BTRFS info (device sda9): disk space caching is enabled
[   13.487270] BTRFS: has skinny extents
[   14.368392] IPv6: ADDRCONF(NETDEV_UP): eth0: link is not ready
[   15.928679] e1000e: eth0 NIC Link is Up 100 Mbps Full Duplex, Flow Control: 
Rx/Tx
[   15.936406] e1000e :00:19.0 eth0: 10/100 speed: disabling TSO
[   15.942879] IPv6: ADDRCONF(NETDEV_CHANGE): eth0: link becomes ready
[  115.065408] ata1.00: exception Emask 0x0 SAct 0x7fd8 SErr 0x0 action 0x6 
frozen
[  115.073159] ata1.00: failed command: WRITE FPDMA QUEUED
[  115.078459] ata1.00: cmd 61/80:98:c0:e7:35/4a:00:1f:00:00/40 tag 19 ncq 
9764864 out
 res 40/00:00:00:00:00/00:00:00:00:00/00 Emask 0x4 (timeout)
[  115.093623] ata1.00: status: { DRDY }
[  115.097314] ata1.00: failed command: WRITE FPDMA QUEUED
[  115.102569] ata1.00: cmd 61/30:a0:40:32:36/20:00:1f:00:00/40 tag 20 ncq 
4218880 out
 res 40/00:00:00:00:00/00:00:00:00:00/00 Emask 0x4 (timeout)
[  115.117668] ata1.00: status: { DRDY }
[  115.121351] ata1.00: failed command: WRITE FPDMA QUEUED
[  115.126602] ata1.00: cmd 61/80:b0:80:f7:37/20:00:1f:00:00/40 tag 22 ncq 
4259840 out
 res 40/00:00:00:00:00/00:00:00:00:00/00 Emask 0x4 (timeout)
[  115.141701] ata1.00: status: { DRDY }
[  115.145389] ata1.00: failed command: WRITE FPDMA QUEUED
[  115.150638] ata1.00: cmd 61/90:b8:70:52:36/03:00:1f:00:00/40 tag 23 ncq 
466944 out
 res 40/00:00:00:00:00/00:00:00:00:00/00 Emask 0x4 (timeout)
[  115.165682] ata1.00: status: { DRDY }
[  115.169357] ata1.00: failed command: WRITE FPDMA QUEUED
[  115.174617] ata1.00: cmd 61/c0:c0:00:58:36/39:00:1f:00:00/40 tag 24 ncq 
7569408 out
 res 40/00:00:00:00:00/00:00:00:00:00/00 Emask 0x4 (timeout)
[  115.189713] ata1.00: status: { DRDY }
[  115.193400] ata1.00: failed command: WRITE FPDMA QUEUED
[  115.198650] ata1.00: cmd 61/80:c8:c0:91:36/4b:00:1f:00:00/40 tag 25 ncq 
9895936 out
 res 40/00:00:00:00:00/00:00:00:00:00/00 Emask 0x4 (timeout)
[  115.213755] ata1.00: status: { DRDY }
[  115.217431] ata1.00: failed command: WRITE FPDMA QUEUED
[  115.222692] ata1.00: cmd 61/80:d0:40:dd:36/4a:00:1f:00:00/40 tag 26 ncq 
9764864 out
 res 40/00:00:00:00:00/00:00:00:00:00/00 Emask 0x4 (timeout)
[  115.237788] ata1.00: status: { DRDY }
[  115.241479] ata1.00: failed command: WRITE FPDMA QUEUED
[  115.246723] ata1.00: cmd 

Re: [PATCH 4/4] arm/arm64: KVM: use kernel mapping to perform invalidation on page fault

2015-01-08 Thread Peter Maydell
On 8 January 2015 at 11:59, Marc Zyngier marc.zyng...@arm.com wrote:
 @@ -180,10 +177,32 @@ static inline void coherent_cache_guest_page(struct 
 kvm_vcpu *vcpu, hva_t hva,
  *
  * VIVT caches are tagged using both the ASID and the VMID and doesn't
  * need any kind of flushing (DDI 0406C.b - Page B3-1392).
 +*
 +* We need to do this through a kernel mapping (using the
 +* user-space mapping has proved to be the wrong
 +* solution). For that, we need to kmap one page at a time,
 +* and iterate over the range.
  */
 -   if (icache_is_pipt()) {
 -   __cpuc_coherent_user_range(hva, hva + size);
 -   } else if (!icache_is_vivt_asid_tagged()) {
 +
 +   VM_BUG_ON(size & PAGE_MASK);
 +
 +   while (size) {
 +   void *va = kmap_atomic_pfn(pfn);
 +
 +   if (!vcpu_has_cache_enabled(vcpu) || ipa_uncached)
 +   kvm_flush_dcache_to_poc(va, PAGE_SIZE);
 +
 +   if (icache_is_pipt())
 +   __cpuc_coherent_user_range((unsigned long)va,
 +  (unsigned long)va + 
 PAGE_SIZE);
 +
 +   size -= PAGE_SIZE;
 +   pfn++;
 +
 +   kunmap_atomic(va);
 +   }

If (vcpu_has_cache_enabled(vcpu) && !ipa_uncached && !icache_is_pipt())
then we're going to run round this loop mapping and unmapping without
actually doing anything. Is it worth hoisting that check out of the
loop? (I think it's going to be the common case for a non-PIPT icache,
right?)

 +   if (!icache_is_pipt() && !icache_is_vivt_asid_tagged()) {
 /* any kind of VIPT cache */
 __flush_icache_all();
 }

Can you remind me why it's OK not to flush the icache for an
ASID tagged VIVT icache? Making this page coherent might actually
be revealing a change in the instructions associated with the VA,
mightn't it?

thanks
-- PMM


Re: [PATCHv2] arch:x86:kvm:Add function for setting pending timer on virtual cpu in x86.c

2015-01-08 Thread Paolo Bonzini


On 08/01/2015 13:15, Andrej Manduch wrote:
  -/* FIXME: this code should not know anything about vcpus */
 I don't want to sounds like I'm nitpicking. But I need to ask. Why is
 this comment removed?

Because the real point of the comment was that the code should not know
anything about VCPU requests.  But it's okay to call a function to tell
the VCPU we want you to call us back at kvm_inject_apic_timer_irqs.

Paolo

  -kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
  +kvm_set_pending_timer(vcpu);
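For reference, a minimal sketch of the kind of helper being described, assuming
(as the quoted diff suggests) that kvm_set_pending_timer() simply wraps the
request:

    /* Sketch only: ask the vCPU to call back into
     * kvm_inject_apic_timer_irqs() on its next entry. */
    void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
    {
            kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
    }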


Re: [PATCH 4/4] arm/arm64: KVM: use kernel mapping to perform invalidation on page fault

2015-01-08 Thread Marc Zyngier
On 08/01/15 12:30, Peter Maydell wrote:
 On 8 January 2015 at 11:59, Marc Zyngier marc.zyng...@arm.com wrote:
 @@ -180,10 +177,32 @@ static inline void coherent_cache_guest_page(struct 
 kvm_vcpu *vcpu, hva_t hva,
  *
  * VIVT caches are tagged using both the ASID and the VMID and 
 doesn't
  * need any kind of flushing (DDI 0406C.b - Page B3-1392).
 +*
 +* We need to do this through a kernel mapping (using the
 +* user-space mapping has proved to be the wrong
 +* solution). For that, we need to kmap one page at a time,
 +* and iterate over the range.
  */
 -   if (icache_is_pipt()) {
 -   __cpuc_coherent_user_range(hva, hva + size);
 -   } else if (!icache_is_vivt_asid_tagged()) {
 +
 +   VM_BUG_ON(size & PAGE_MASK);
 +
 +   while (size) {
 +   void *va = kmap_atomic_pfn(pfn);
 +
 +   if (!vcpu_has_cache_enabled(vcpu) || ipa_uncached)
 +   kvm_flush_dcache_to_poc(va, PAGE_SIZE);
 +
 +   if (icache_is_pipt())
 +   __cpuc_coherent_user_range((unsigned long)va,
 +  (unsigned long)va + 
 PAGE_SIZE);
 +
 +   size -= PAGE_SIZE;
 +   pfn++;
 +
 +   kunmap_atomic(va);
 +   }
 
 If (vcpu_has_cache_enabled(vcpu) && !ipa_uncached && !icache_is_pipt())
 then we're going to run round this loop mapping and unmapping without
 actually doing anything. Is it worth hoisting that check out of the
 loop? (I think it's going to be the common case for a non-PIPT icache,
 right?)

Yup, that's a valid optimization.
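For illustration, a hedged sketch of the hoisted form, reusing only the helpers
already visible in the quoted patch:

    /* Sketch: decide once, outside the loop, whether any per-page work is needed. */
    bool need_dcache = !vcpu_has_cache_enabled(vcpu) || ipa_uncached;
    bool need_icache = icache_is_pipt();

    if (need_dcache || need_icache) {
            while (size) {
                    void *va = kmap_atomic_pfn(pfn);

                    if (need_dcache)
                            kvm_flush_dcache_to_poc(va, PAGE_SIZE);
                    if (need_icache)
                            __cpuc_coherent_user_range((unsigned long)va,
                                                       (unsigned long)va + PAGE_SIZE);

                    kunmap_atomic(va);
                    size -= PAGE_SIZE;
                    pfn++;
            }
    }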

 +   if (!icache_is_pipt() && !icache_is_vivt_asid_tagged()) {
 /* any kind of VIPT cache */
 __flush_icache_all();
 }
 
 Can you remind me why it's OK not to flush the icache for an
 ASID tagged VIVT icache? Making this page coherent might actually
 be revealing a change in the instructions associated with the VA,
 mightn't it?

ASID cached VIVT icaches are also VMID tagged. It is thus impossible for
stale cache lines to come with a new page. And if by synchronizing the
caches you obtain a different instruction stream, it means you've
restored the wrong page.

Thanks,

M.
-- 
Jazz is not dead. It just smells funny...


Re: [Xen-devel] [RFC 2/2] x86, vdso, pvclock: Simplify and speed up the vdso pvclock reader

2015-01-08 Thread David Vrabel

On 23/12/2014 00:39, Andy Lutomirski wrote:

The pvclock vdso code was too abstracted to understand easily and
excessively paranoid.  Simplify it for a huge speedup.

This opens the door for additional simplifications, as the vdso no
longer accesses the pvti for any vcpu other than vcpu 0.

Before, vclock_gettime using kvm-clock took about 64ns on my machine.
With this change, it takes 19ns, which is almost as fast as the pure TSC
implementation.


Xen guests don't use any of this at the moment, and I don't think this 
change would prevent us from using it in the future, so:


Acked-by: David Vrabel david.vra...@citrix.com

But see some additional comments below.


--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -78,47 +78,59 @@ static notrace const struct pvclock_vsyscall_time_info 
*get_pvti(int cpu)

  static notrace cycle_t vread_pvclock(int *mode)
  {
-   const struct pvclock_vsyscall_time_info *pvti;
+   const struct pvclock_vcpu_time_info *pvti = &get_pvti(0)->pvti;


Xen updates pvti when scheduling a VCPU.  Using 0 here requires that 
VCPU 0 has been recently scheduled by Xen.  Perhaps using the current 
CPU here would be better?  It doesn't matter if the task is subsequently 
moved to a different CPU before using pvti.



+* Note: The kernel and hypervisor must guarantee that cpu ID
+* number maps 1:1 to per-CPU pvclock time info.
+*
+* Because the hypervisor is entirely unaware of guest userspace
+* preemption, it cannot guarantee that per-CPU pvclock time
+* info is updated if the underlying CPU changes or that that
+* version is increased whenever underlying CPU changes.
+*
+* On KVM, we are guaranteed that pvti updates for any vCPU are
+* atomic as seen by *all* vCPUs.  This is an even stronger
+* guarantee than we get with a normal seqlock.
 *
+* On Xen, we don't appear to have that guarantee, but Xen still
+* supplies a valid seqlock using the version field.
+
+* We only do pvclock vdso timing at all if
+* PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
+* mean that all vCPUs have matching pvti and that the TSC is
+* synced, so we can just look at vCPU 0's pvti.


I think this is a much stronger requirement than you actually need.

You only require:

- the system time (pvti-system_time) for all pvti's is synchronized; and
- TSC is synchronized; and
- the pvti has been updated sufficiently recently (so the error in the 
result is within acceptable margins).
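As an aside, a rough sketch of the version-based retry that the quoted comment
calls a seqlock; the pvti field names are from pvclock_vcpu_time_info, while the
TSC read and the scaling helper are assumptions for illustration:

    /* Sketch: retry until the version is even and unchanged across the read. */
    do {
            version = pvti->version;
            rmb();                  /* read version before the payload */
            delta = rdtsc() - pvti->tsc_timestamp;
            ns = pvti->system_time +
                 pvclock_scale_delta(delta, pvti->tsc_to_system_mul,
                                     pvti->tsc_shift);
            rmb();                  /* read payload before re-checking */
    } while ((pvti->version & 1) || pvti->version != version);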


Can you add documentation to arch/x86/include/asm/pvclock-abi.h to 
describe what properties PVCLOCK_TSC_STABLE_BIT guarantees?


David


Re: [PATCH 1/4] mm: Correct ordering of *_clear_flush_young_notify

2015-01-08 Thread Paolo Bonzini
Andrea, Rik, please review this patch.

Thanks in advance,

Paolo

On 08/01/2015 12:59, Marc Zyngier wrote:
 From: Steve Capper steve.cap...@linaro.org
 
 ptep_clear_flush_young_notify and pmdp_clear_flush_young_notify both
 call the notifiers *after* the pte/pmd has been made young.
 
 This can cause problems with KVM that relies on being able to block
 MMU notifiers when carrying out maintenance of second stage
 descriptors.
 
 This patch ensures that the MMU notifiers are called before ptes and
 pmds are made old.
 
 Signed-off-by: Steve Capper steve.cap...@linaro.org
 ---
  include/linux/mmu_notifier.h | 8 
  1 file changed, 4 insertions(+), 4 deletions(-)
 
 diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
 index 95243d2..c454c76 100644
 --- a/include/linux/mmu_notifier.h
 +++ b/include/linux/mmu_notifier.h
 @@ -290,11 +290,11 @@ static inline void mmu_notifier_mm_destroy(struct 
 mm_struct *mm)
   int __young;\
   struct vm_area_struct *___vma = __vma;  \
   unsigned long ___address = __address;   \
 - __young = ptep_clear_flush_young(___vma, ___address, __ptep);   \
 - __young |= mmu_notifier_clear_flush_young(___vma-vm_mm,\
 + __young = mmu_notifier_clear_flush_young(___vma-vm_mm, \
 ___address,   \
 ___address +  \
   PAGE_SIZE); \
 + __young |= ptep_clear_flush_young(___vma, ___address, __ptep);  \
   __young;\
  })
  
 @@ -303,11 +303,11 @@ static inline void mmu_notifier_mm_destroy(struct 
 mm_struct *mm)
   int __young;\
   struct vm_area_struct *___vma = __vma;  \
   unsigned long ___address = __address;   \
 - __young = pmdp_clear_flush_young(___vma, ___address, __pmdp);   \
 - __young |= mmu_notifier_clear_flush_young(___vma-vm_mm,\
 + __young = mmu_notifier_clear_flush_young(___vma-vm_mm, \
 ___address,   \
 ___address +  \
   PMD_SIZE);  \
 + __young |= pmdp_clear_flush_young(___vma, ___address, __pmdp);  \
   __young;\
  })
  
 


Re: [PATCH 4/4] arm/arm64: KVM: use kernel mapping to perform invalidation on page fault

2015-01-08 Thread Peter Maydell
On 8 January 2015 at 13:07, Marc Zyngier marc.zyng...@arm.com wrote:
 Can you remind me why it's OK not to flush the icache for an
 ASID tagged VIVT icache? Making this page coherent might actually
 be revealing a change in the instructions associated with the VA,
 mightn't it?

 ASID cached VIVT icaches are also VMID tagged. It is thus impossible for
 stale cache lines to come with a new page. And if by synchronizing the
 caches you obtain a different instruction stream, it means you've
 restored the wrong page.

...is that true even if the dirty data in the dcache comes from
the userspace process doing DMA or writing the initial boot
image or whatever?

-- PMM


Re: [PATCH] vhost/net: length miscalculation

2015-01-08 Thread Michael S. Tsirkin
On Wed, Jan 07, 2015 at 11:58:00PM +0300, Sergei Shtylyov wrote:
 Hello.
 
 On 01/07/2015 11:55 AM, Michael S. Tsirkin wrote:
 
 commit 8b38694a2dc8b18374310df50174f1e4376d6824
  vhost/net: virtio 1.0 byte swap
 had this chunk:
 -   heads[headcount - 1].len += datalen;
 +   heads[headcount - 1].len = cpu_to_vhost32(vq, len - datalen);
 
 This adds datalen with the wrong sign, causing guest panics.
 
 Fixes: 8b38694a2dc8b18374310df50174f1e4376d6824
 
The format of this tag assumes 12-digit SHA1 hash and the commit
 description enclosed in parens and double quotes. See
 Documentation/SubmittingPatches.
 
 Reported-by: Alex Williamson alex.william...@redhat.com
 Suggested-by: Greg Kurz gk...@linux.vnet.ibm.com
 Signed-off-by: Michael S. Tsirkin m...@redhat.com
 
 WBR, Sergei

I pushed the patches to Linus unfortunately - there's
some urgency since many people are hitting the bug.
Will do my best to do it right next time.

-- 
MST


Re: [PATCH 0/8] KVM: x86: Emulator fixes

2015-01-08 Thread Paolo Bonzini


On 25/12/2014 01:52, Nadav Amit wrote:
 Few more emulator fixes. Each is logically independent from the others.
 
 The first one is the most interesting one.  It appears that the current
 behavior may cause the VM to enter the page-fault handler twice on certain
 faulting write accesses. If you do not like my solution, please propose a
 better one.
 
 The fourth (JMP/CALL using call- or task-gate) is not a fix, but returns an
 error instead of emulating the wrong (#GP) exception.
 
 Thanks for reviewing the patches.
 
 Nadav Amit (8):
   KVM: x86: #PF error-code on R/W operations is wrong
   KVM: x86: pop sreg accesses only 2 bytes
   KVM: x86: fnstcw and fnstsw may cause spurious exception
   KVM: x86: JMP/CALL using call- or task-gate causes exception
   KVM: x86: em_call_far should return failure result
   KVM: x86: POP [ESP] is not emulated correctly
   KVM: x86: Do not set access bit on accessed segments
   KVM: x86: Access to LDT/GDT that wraparound is incorrect
 
  arch/x86/include/asm/kvm_host.h |  12 
  arch/x86/kvm/emulate.c  | 138 
 ++--
  arch/x86/kvm/mmu.h  |  12 
  3 files changed, 103 insertions(+), 59 deletions(-)
 

I'm applying patches 2-8.  I want to play a bit more with patch 1.

Paolo


[PATCH 3/4] fixes for changes in the percpu counter API

2015-01-08 Thread Paolo Bonzini
These API changes were introduced in 3.18.  Update source pointer
since 3.18 now compiles.

Signed-off-by: Paolo Bonzini pbonz...@redhat.com
---
 external-module-compat-comm.h | 9 +
 linux | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/external-module-compat-comm.h b/external-module-compat-comm.h
index 6866658..f0e1ce2 100644
--- a/external-module-compat-comm.h
+++ b/external-module-compat-comm.h
@@ -1440,6 +1440,15 @@ static inline void pci_set_dev_assigned(struct pci_dev 
*pdev)
pdev->dev_flags |= PCI_DEV_FLAGS_ASSIGNED;
 }
 
+#undef percpu_counter_init
+#define percpu_counter_init(fbc, value, gfp)\
+({  \
+static struct lock_class_key __key; \
+\
+__percpu_counter_init(fbc, value, &__key);  \
+})
+#endif
+
 #if LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0)
 void *get_xsave_addr(struct xsave_struct *xsave, int feature);
 #endif
diff --git a/linux b/linux
index da01e61..b2776bf 16
--- a/linux
+++ b/linux
@@ -1 +1 @@
-Subproject commit da01e61428aa2b5c424fddc11178498462d8c77f
+Subproject commit b2776bf7149bddd1f4161f14f79520f17fc1d71d
-- 
1.8.3.1




Re: [RESEND PATCH v15 07/11] KVM: arm: page logging 2nd stage fault handling

2015-01-08 Thread Christoffer Dall
On Wed, Jan 07, 2015 at 05:43:18PM -0800, Mario Smarduch wrote:
 Hi Christoffer,
   before going through your comments, I discovered that
 in 3.18.0-rc2 - a generic __get_user_pages_fast()
 was implemented, now ARM picks this up. This causes
 gfn_to_pfn_prot() to return meaningful 'writable'
 value for a read fault, provided the region is writable.
 
 Prior to that the weak version returned 0 and 'writable'
 had no optimization effect to set pte/pmd - RW on
 a read fault.
 
 As a consequence dirty logging broke in 3.18, I was seeing
 weird but very intermittent issues. I just put in the
 additional few lines to fix it, prevent pte RW (only R) on
 read faults  while  logging writable region.
 
 On 01/07/2015 04:38 AM, Christoffer Dall wrote:
  On Wed, Dec 17, 2014 at 06:07:29PM -0800, Mario Smarduch wrote:
  This patch is a followup to v15 patch series, with following changes:
  - When clearing/dissolving a huge, PMD mark huge page range dirty, since
the state of whole range is unknown. After the huge page is dissolved 
dirty page logging is at page granularity.
  
  What is the sequence of events where you could have dirtied another page
  within the PMD range after the user initially requested dirty page
  logging?
 
 No there is none. My issue was the start point for tracking dirty pages
 and that would be second call to dirty log read. Not first
 call after initial write protect where any page in range can
 be assumed dirty. I'll remove this, not sure if there would be any
 use case to call dirty log only once.
 

Calling dirty log once can not give you anything meaningful, right?  You
must assume all memory is 'dirty' at this point, no?

-Christoffer
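For reference, a hedged sketch of the "additional few lines" described above;
the exact placement inside user_mem_abort() is an assumption:

    /* Sketch: when dirty logging is active on a writable slot, do not let a
     * read fault install a writable mapping; a later write must still fault
     * so that it can be logged. */
    if (memslot_is_logging(memslot) && writable && !write_fault)
            writable = false;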


[PATCH 4/4] add trace_seq_buffer_ptr

2015-01-08 Thread Paolo Bonzini
These API changes were introduced in 3.19-rc1.  Update source pointer
since this is the only required change for 3.19-rc1.

Signed-off-by: Paolo Bonzini pbonz...@redhat.com
---
 external-module-compat-comm.h | 4 
 linux | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/external-module-compat-comm.h b/external-module-compat-comm.h
index f0e1ce2..66cfde2 100644
--- a/external-module-compat-comm.h
+++ b/external-module-compat-comm.h
@@ -1452,3 +1452,7 @@ static inline void pci_set_dev_assigned(struct pci_dev 
*pdev)
 #if LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0)
 void *get_xsave_addr(struct xsave_struct *xsave, int feature);
 #endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0)
+#define trace_seq_buffer_ptr(p) ((p)->buffer + (p)->len)
+#endif
diff --git a/linux b/linux
index b2776bf..97bf6af 16
--- a/linux
+++ b/linux
@@ -1 +1 @@
-Subproject commit b2776bf7149bddd1f4161f14f79520f17fc1d71d
+Subproject commit 97bf6af1f928216fd6c5a66e8a57bfa95a659672
-- 
1.8.3.1



[PATCH 1/4] stubs for xsavec support

2015-01-08 Thread Paolo Bonzini
These are needed for KVM changes in 3.18.

Recent kernels added a separate feature word for XSAVE features, and KVM's
CPUID code is relying on the new definition.  Except for cpu_has_xsaves,
it's never accessing the feature itself: wrap cpu_has_xsaves with
kvm_cpu_has_xsaves, and then there is no problem with out-of-bounds
accesses.

Signed-off-by: Paolo Bonzini pbonz...@redhat.com
---
 external-module-compat-comm.h |  4 
 external-module-compat.c  | 11 +++
 sync  | 14 --
 x86/external-module-compat.h  | 37 +
 4 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/external-module-compat-comm.h b/external-module-compat-comm.h
index c20b1ed..1717ba3 100644
--- a/external-module-compat-comm.h
+++ b/external-module-compat-comm.h
@@ -1424,3 +1424,7 @@ extern u64 kvm_get_boot_base_ns(struct timekeeper *tk);
 
 #undef is_zero_pfn
 #define is_zero_pfn(pfn) ((pfn) == page_to_pfn(ZERO_PAGE(0)))
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0)
+void *get_xsave_addr(struct xsave_struct *xsave, int feature);
+#endif
diff --git a/external-module-compat.c b/external-module-compat.c
index 38717b6..068ab44 100644
--- a/external-module-compat.c
+++ b/external-module-compat.c
@@ -362,3 +362,14 @@ u64 kvm_get_boot_base_ns(struct timekeeper *tk)
 }
 #endif
 #endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0)
+void *get_xsave_addr(struct xsave_struct *xsave, int feature)
+{
+   int index = fls64(feature) - 1;
+   u32 size, offset, ecx, edx;
+
+   cpuid_count(0xd, index, &size, &offset, &ecx, &edx);
+   return (u8 *)xsave + offset;
+}
+#endif
diff --git a/sync b/sync
index fff85f3..3086b70 100755
--- a/sync
+++ b/sync
@@ -44,8 +44,8 @@ def hack_content(fname, data):
 'set_desc_base set_desc_limit pvclock_vcpu_time_info tboot_enabled '
 'i387_fxsave_struct native_write_msr_safe xsave_struct '
 'fpu_alloc fpu_free fpu_restore_checking fpu_save_init fpu_finit '
-'load_gdt store_gdt xstate_size cpu_has_xsave __get_user_pages_fast '
-'set_64bit siginfo_t use_mm '
+'load_gdt store_gdt xstate_size cpu_has_xsave cpu_has_xsaves '
+'__get_user_pages_fast set_64bit siginfo_t use_mm '
 'unuse_mm request_threaded_irq init_fpu __this_cpu_read '
 '__this_cpu_write sigset_from_compat '
 'sched_info_on x86_pmu_capability perf_get_x86_pmu_capability '
@@ -299,6 +299,16 @@ def hack_content(fname, data):
 if line == '#endif' and finish_endif:
 w('#endif')
 finish_endif = False
+if match(r'xcomp_bv'):
+w('#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0)')
+w(line)
+w('#else')
+# Will be under if (cpu_has_xsaves), which is always 0.  Just
+# replace with something that compiles, the C statement might
+# span more than one line.
+w('WARN(1, this should never happen),')
+w(sub(r'xcomp_bv', 'xstate_bv', line))
+line = '#endif'
 if match(r'tkr\.'):
 w('#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0)')
 w(line)
diff --git a/x86/external-module-compat.h b/x86/external-module-compat.h
index dec53b6..87cf76a 100644
--- a/x86/external-module-compat.h
+++ b/x86/external-module-compat.h
@@ -428,6 +428,23 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long 
long *p)
 #define X86_FEATURE_MPX(9*32+14) /* Memory Protection 
Extension */
 #endif
 
+#if X86_FEATURE_XSAVEOPT < 10 * 32
+#undef X86_FEATURE_XSAVEOPT
+#endif
+#define X86_FEATURE_XSAVEOPT   (10*32+0) /* XSAVEOPT instruction */
+
+#ifndef X86_FEATURE_XSAVEC
+#define X86_FEATURE_XSAVEC (10*32+1) /* XSAVEC instruction */
+#endif
+
+#ifndef X86_FEATURE_XGETBV1
+#define X86_FEATURE_XGETBV1(10*32+2) /* XCR1 register */
+#endif
+
+#ifndef X86_FEATURE_XSAVES
+#define X86_FEATURE_XSAVES (10*32+3) /* XSAVES instruction */
+#endif
+
 #ifndef MSR_AMD64_PATCH_LOADER
 #define MSR_AMD64_PATCH_LOADER 0xc0010020
 #endif
@@ -586,6 +603,10 @@ static inline void preempt_notifier_sys_exit(void) {}
 #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
 #endif
 
+#ifndef cpu_has_xsaves
+#define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES)
+#endif
+
 /* EFER_LMA and EFER_LME are missing in pre 2.6.24 i386 kernels */
 #ifndef EFER_LME
 #define _EFER_LME   8  /* Long mode enable */
@@ -1141,6 +1162,16 @@ static inline int kvm_init_fpu(struct task_struct *tsk)
 #define XSTATE_EAGER   (XSTATE_BNDREGS | XSTATE_BNDCSR)
 #endif
 
+#ifndef XSTATE_OPMASK
+#define XSTATE_OPMASK   0x20
+#define XSTATE_ZMM_Hi2560x40
+#define XSTATE_Hi16_ZMM 0x80
+#endif
+
+#ifndef XSTATE_AVX512
+#define XSTATE_AVX512   (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)
+#endif
+
 #ifndef XSTATE_EXTEND_MASK
 #define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63)))
 #endif
@@ -1496,6 +1527,12 @@ static 

[PATCH 2/4] fixes for changes in the iommu and PCI APIs

2015-01-08 Thread Paolo Bonzini
These API changes were introduced in 3.18.

Signed-off-by: Paolo Bonzini pbonz...@redhat.com
---
 external-module-compat-comm.h | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/external-module-compat-comm.h b/external-module-compat-comm.h
index 1717ba3..6866658 100644
--- a/external-module-compat-comm.h
+++ b/external-module-compat-comm.h
@@ -14,6 +14,7 @@
 #include <linux/kvm_para.h>
 #include <linux/kconfig.h>
 #include <linux/cpu.h>
+#include <linux/pci.h>
 #include <linux/time.h>
 #include <linux/kernel.h>
 #include <asm/processor.h>
@@ -1425,6 +1426,20 @@ extern u64 kvm_get_boot_base_ns(struct timekeeper *tk);
 #undef is_zero_pfn
 #define is_zero_pfn(pfn) ((pfn) == page_to_pfn(ZERO_PAGE(0)))
 
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18, 0)
+#define iommu_capable(dummy, cap) \
+iommu_domain_has_cap(kvm->arch.iommu_domain, cap)
+
+static inline void pci_clear_dev_assigned(struct pci_dev *pdev)
+{
+   pdev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED;
+}
+
+static inline void pci_set_dev_assigned(struct pci_dev *pdev)
+{
+   pdev->dev_flags |= PCI_DEV_FLAGS_ASSIGNED;
+}
+
 #if LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0)
 void *get_xsave_addr(struct xsave_struct *xsave, int feature);
 #endif
-- 
1.8.3.1


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH kvm-kmod 0/4] Update to 3.19-rc1

2015-01-08 Thread Paolo Bonzini
The largest changes are in the XSAVE support.  Recent kernels added a
separate feature word for XSAVE features, and KVM's CPUID code is relying
on the new definition.  Except for cpu_has_xsaves, it's never accessing
the feature itself: wrap cpu_has_xsaves with kvm_cpu_has_xsaves, and
then there is no problem with out-of-bounds accesses.

Paolo Bonzini (4):
  stubs for xsavec support
  fixes for changes in the iommu and PCI APIs
  fixes for changes in the percpu counter API
  add trace_seq_buffer_ptr

 external-module-compat-comm.h | 32 
 external-module-compat.c  | 11 +++
 linux |  2 +-
 sync  | 14 --
 x86/external-module-compat.h  | 37 +
 5 files changed, 93 insertions(+), 3 deletions(-)

-- 
1.8.3.1



Re: [PATCH v15 10/11] KVM: arm/arm64: Enable Dirty Page logging for ARMv8

2015-01-08 Thread Christoffer Dall
On Wed, Jan 07, 2015 at 05:51:15PM -0800, Mario Smarduch wrote:
 On 01/07/2015 04:47 AM, Christoffer Dall wrote:
  On Sun, Dec 14, 2014 at 11:28:07PM -0800, Mario Smarduch wrote:
  This patch enables ARMv8 ditry page logging support. Plugs ARMv8 into 
  generic
  
 dirty
 yeah.
  
  layer through Kconfig symbol, and drops earlier ARM64 constraints to enable
  logging at architecture layer.
 
  Signed-off-by: Mario Smarduch m.smard...@samsung.com
  ---
   arch/arm/include/asm/kvm_host.h | 12 
   arch/arm/kvm/arm.c  |  4 
   arch/arm/kvm/mmu.c  | 19 +++
   arch/arm64/kvm/Kconfig  |  2 ++
   4 files changed, 13 insertions(+), 24 deletions(-)
 
  diff --git a/arch/arm/include/asm/kvm_host.h 
  b/arch/arm/include/asm/kvm_host.h
  index b138431..088ea87 100644
  --- a/arch/arm/include/asm/kvm_host.h
  +++ b/arch/arm/include/asm/kvm_host.h
  @@ -223,18 +223,6 @@ static inline void __cpu_init_hyp_mode(phys_addr_t 
  boot_pgd_ptr,
 kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr);
   }
   
  -/**
  - * kvm_flush_remote_tlbs() - flush all VM TLB entries
  - * @kvm:  pointer to kvm structure.
  - *
  - * Interface to HYP function to flush all VM TLB entries without address
  - * parameter.
  - */
  -static inline void kvm_flush_remote_tlbs(struct kvm *kvm)
  -{
  -  kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
  -}
  -
   static inline int kvm_arch_dev_ioctl_check_extension(long ext)
   {
 return 0;
  diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
  index 6e4290c..1b6577c 100644
  --- a/arch/arm/kvm/arm.c
  +++ b/arch/arm/kvm/arm.c
  @@ -740,7 +740,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
*/
   int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
   {
  -#ifdef CONFIG_ARM
 bool is_dirty = false;
 int r;
   
  @@ -753,9 +752,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct 
  kvm_dirty_log *log)
   
 mutex_unlock(kvm-slots_lock);
 return r;
  -#else /* arm64 */
  -  return -EINVAL;
  -#endif
   }
   
   static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
  diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
  index dc763bb..59003df 100644
  --- a/arch/arm/kvm/mmu.c
  +++ b/arch/arm/kvm/mmu.c
  @@ -52,11 +52,18 @@ static phys_addr_t hyp_idmap_vector;
   
   static bool kvm_get_logging_state(struct kvm_memory_slot *memslot)
   {
  -#ifdef CONFIG_ARM
 return !!memslot-dirty_bitmap;
  -#else
  -  return false;
  -#endif
  +}
  +
  +/**
  + * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
  + * @kvm:  pointer to kvm structure.
  + *
  + * Interface to HYP function to flush all VM TLB entries
  + */
  +inline void kvm_flush_remote_tlbs(struct kvm *kvm)
  
  did you intend for a non-staic inline here?
 
 Yes it's used in arm.c and mmu.c

then why inline?

I'm not a compiler expert by any measure, but poking around I'm pretty
sure the inline keyword in this context is useless.  See for example:
http://www.cs.nyu.edu/~xiaojian/bookmark/c_programming/Inline_Functions.htm

So I suggest either make it a normal stand-alone function or keep it as
duplicate static inlines in the header files if you're adamant about
this being inlined.
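Concretely, the two options read roughly like this (sketch only, reusing the body
already shown in the patch):

    /* Option 1: one ordinary definition in mmu.c, declared in a header that
     * arm.c also includes. */
    void kvm_flush_remote_tlbs(struct kvm *kvm)
    {
            kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
    }

    /* Option 2: keep it as a static inline in the header, so arm.c and mmu.c
     * each get their own inlined copy. */
    static inline void kvm_flush_remote_tlbs(struct kvm *kvm)
    {
            kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
    }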

-Christoffer


Re: [PATCH 2/3] x86_64,entry: Use sysret to return to userspace when possible

2015-01-08 Thread Borislav Petkov
On Thu, Jan 08, 2015 at 01:29:28PM +0100, Borislav Petkov wrote:
 Ok, dammit, it happened again:

Running -rc2+ without your first two patches doesn't trigger it. Well,
I don't know what workload even triggered it, it used to happen during
system upgrade. I left the box without your patches to build the kernel
in a loop and went to lunch.

Now I'm back and it all still looks good.

I'll try running only with this second patch, i.e. the ret-to-user
SYSRET speedup thing. See what happens.

-- 
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.


Re: [RFC PATCH] KVM: introduce kvm_check_device

2015-01-08 Thread Andre Przywara
Hi Scott,

On 07/01/15 18:16, Scott Wood wrote:
 On Wed, 2015-01-07 at 18:11 +, Andre Przywara wrote:
 On 07/01/15 17:45, Scott Wood wrote:
 On Wed, 2015-01-07 at 10:55 +, Andre Przywara wrote:
 Hi Scott,

 thanks for looking at the patch.

 On 06/01/15 20:52, Scott Wood wrote:
 Out of curiosity, why do you need to test it from inside the kernel but
 outside kvm_main.c?

 I need it from arch/arm/kvm/arm.c or alternatively virt/kvm/arm/vgic.c.
 The problem is that while KVM_CREATE_DEVICE works fine with checking the
 availability of the requested device, KVM_CREATE_IRQCHIP does not - and
 the latter is handled in the arch specific parts of the code. At the
 moment the GIC_V2 is the only IRQ chip, so it's all or nothing right
 now. But very soon there will be a GIC_V3, with GIC_V2 compatibility not
 always being available, so KVM_CREATE_IRQCHIP may fail although there is
 an in-kernel IRQ chip available.
 Instead of hacking something up I thought it would be cleaner to use the
 existing framework and just map KVM_CREATE_IRQCHIP to
 KVM_CREATE_DEVICE(KVM_DEV_TYPE_ARM_VGIC_V2).

 In that case you'd need the full create_device functionality from
 arch/arm/kvm, not just testing whether a device type is present, right?

 Well, not really. On KVM_CREATE_IRQCHIP kvm_vgic_create is called, which
 is also what the KVM device .create function for GIC_V2 does.
 So yes, we don't fully use the KVM device framework, but just use the
 same functionality.
 Not sure if it would make sense to use more of the KVM device framework,
 as currently there is no issue with the current approach and I just need
 to know whether the GIC_V2 has been registered.
 
 If you're communicating privately with the vgic code to create the
 irqchip, can't you also communicate privately to see if that
 functionality is available?  I'm not opposed to this patch, but it seems
 odd to use the device framework just for testing whether some other
 interface is available.

Mainly because I wanted to avoid a solution that is private to the VGIC
and ARM. Also hacking this into the VGIC code looks much uglier than
simply using kvm_check_device_type(). Currently I just register GIC_V2
or GIC_V3 according to the hardware's capabilities. Now I have to
duplicate this decision making in the VGIC code again.
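For the sake of discussion, a sketch of what such a check could look like; the
ops-table name below is an assumption about how device types are registered in
kvm_main.c:

    /* Sketch only: report whether a KVM device type has been registered. */
    bool kvm_check_device_type(u32 type)
    {
            return type < ARRAY_SIZE(kvm_device_ops_table) &&
                   kvm_device_ops_table[type] != NULL;
    }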

I have now worked around the issue without this patch, but it looks quite
ugly. I will send out a new version of the patch series tomorrow and
will put you on CC: for the respective patches.
So if in doubt we can continue this discussion on those patches later.

Cheers,
Andre.


Re: [patch 2/3] KVM: x86: add option to advance tscdeadline hrtimer expiration

2015-01-08 Thread Marcelo Tosatti
On Mon, Jan 05, 2015 at 07:12:36PM +0100, Radim Krcmar wrote:
 2014-12-23 15:58-0500, Marcelo Tosatti:
  For the hrtimer which emulates the tscdeadline timer in the guest,
  add an option to advance expiration, and busy spin on VM-entry waiting
  for the actual expiration time to elapse.
  
  This allows achieving low latencies in cyclictest (or any scenario 
  which requires strict timing regarding timer expiration).
  
  Reduces average cyclictest latency from 12us to 8us
  on Core i5 desktop.
  
  Note: this option requires tuning to find the appropriate value 
  for a particular hardware/guest combination. One method is to measure the 
  average delay between apic_timer_fn and VM-entry. 
  Another method is to start with 1000ns, and increase the value
  in say 500ns increments until avg cyclictest numbers stop decreasing.
  
  Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
 
 Reviewed-by: Radim Krčmář rkrc...@redhat.com
 
 (Other patches weren't touched, so my previous Reviewed-by holds.)
 
  +++ kvm/arch/x86/kvm/x86.c
  @@ -108,6 +108,10 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz)
   static u32 tsc_tolerance_ppm = 250;
   module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
   
  +/* lapic timer advance (tscdeadline mode only) in nanoseconds */
  +unsigned int lapic_timer_advance_ns = 0;
  +module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
  +
   static bool backwards_tsc_observed = false;
   
   #define KVM_NR_SHARED_MSRS 16
  @@ -5625,6 +5629,10 @@ static void kvm_timer_init(void)
  __register_hotcpu_notifier(kvmclock_cpu_notifier_block);
  cpu_notifier_register_done();
   
   +   if (check_tsc_unstable() && lapic_timer_advance_ns) {
  +   pr_info(kvm: unstable TSC, disabling 
  lapic_timer_advance_ns\n);
  +   lapic_timer_advance_ns = 0;
 
 Does unstable TSC invalidate this feature?
 (lapic_timer_advance_ns can be overridden, so we don't differentiate
  workflows that calibrate after starting with 0.)

Hum, i don't see why. Paolo?

 And cover letter is a bit misleading:  The condition does nothing to
 guarantee TSC based __delay() loop.  (Right now, __delay() = delay_tsc()
 whenever the hardware has TSC, regardless of stability, thus always.)

OK.



Re: [PATCH 4/4] arm/arm64: KVM: use kernel mapping to perform invalidation on page fault

2015-01-08 Thread Marc Zyngier
On 08/01/15 13:16, Peter Maydell wrote:
 On 8 January 2015 at 13:07, Marc Zyngier marc.zyng...@arm.com wrote:
 Can you remind me why it's OK not to flush the icache for an
 ASID tagged VIVT icache? Making this page coherent might actually
 be revealing a change in the instructions associated with the VA,
 mightn't it?

 ASID cached VIVT icaches are also VMID tagged. It is thus impossible for
 stale cache lines to come with a new page. And if by synchronizing the
 caches you obtain a different instruction stream, it means you've
 restored the wrong page.
 
 ...is that true even if the dirty data in the dcache comes from
 the userspace process doing DMA or writing the initial boot
 image or whatever?

We perform this on a page that is being brought in stage-2. Two cases:

- This is a page is mapped for the first time: the icache should be
invalid for this page (the guest should have invalidated it the first
place),

- This is a page that we bring back from swap: the page must match the
one that has been swapped out. If it has been DMA'ed in in the meantime,
then the guest surely has flushed its icache if it intends to branch to
it, hasn't it?

I have the feeling I'm missing something from your question...

M.
-- 
Jazz is not dead. It just smells funny...


Re: [PATCH 4/4] arm/arm64: KVM: use kernel mapping to perform invalidation on page fault

2015-01-08 Thread Peter Maydell
On 8 January 2015 at 15:06, Marc Zyngier marc.zyng...@arm.com wrote:
 On 08/01/15 13:16, Peter Maydell wrote:
 ASID cached VIVT icaches are also VMID tagged. It is thus impossible for
 stale cache lines to come with a new page. And if by synchronizing the
 caches you obtain a different instruction stream, it means you've
 restored the wrong page.

 ...is that true even if the dirty data in the dcache comes from
 the userspace process doing DMA or writing the initial boot
 image or whatever?

 We perform this on a page that is being brought in stage-2. Two cases:

 - This is a page is mapped for the first time: the icache should be
 invalid for this page (the guest should have invalidated it the first
 place),

If this is the first instruction in the guest (ie we've just
(warm) reset the VM and are running the kernel as loaded into the guest
by QEMU/kvmtool) then the guest can't have invalidated the icache,
and QEMU can't do the invalidate because it doesn't have the vaddr
and VMID of the guest.

 - This is a page that we bring back from swap: the page must match the
 one that has been swapped out. If it has been DMA'ed in in the meantime,
 then the guest surely has flushed its icache if it intends to branch to
 it, hasn't it?

I agree that for the DMA case the guest will have done the invalidate.

-- PMM


Re: [PATCH v15 11/11] KVM: arm/arm64: Add support to dissolve huge PUD

2015-01-08 Thread Mario Smarduch
On 01/08/2015 03:32 AM, Christoffer Dall wrote:
[...]
 Not sure myself what's the vision for PUD support.

 
 with 4-level paging on aarch64, we use PUDs but we haven't added any
 code to insert huge PUDs (only regular ones) on the stage-2 page tables,
 even if the host kernel happens to suddenly support huge PUDs for the
 stage-1 page tables, which is what I think we were trying to address.
 
 
 So I really think we can drop this whole patch.  As I said, really sorry
 about this one!
No problem I'll drop this patch.
 
 -Christoffer
 



[PATCH] KVM: x86: allow TSC deadline timer on all hosts

2015-01-08 Thread Radim Krčmář
Emulation does not utilize the feature.

Signed-off-by: Radim Krčmář rkrc...@redhat.com
---
 arch/x86/kvm/x86.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 933a373..ed879f1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2743,6 +2743,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long 
ext)
case KVM_CAP_READONLY_MEM:
case KVM_CAP_HYPERV_TIME:
case KVM_CAP_IOAPIC_POLARITY_IGNORED:
+   case KVM_CAP_TSC_DEADLINE_TIMER:
 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_PCI_2_3:
@@ -2781,9 +2782,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long 
ext)
case KVM_CAP_TSC_CONTROL:
r = kvm_has_tsc_control;
break;
-   case KVM_CAP_TSC_DEADLINE_TIMER:
-   r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER);
-   break;
default:
r = 0;
break;
-- 
2.2.0



Re: [PATCH v15 11/11] KVM: arm/arm64: Add support to dissolve huge PUD

2015-01-08 Thread Mario Smarduch
On 01/08/2015 03:32 AM, Christoffer Dall wrote:
 On Wed, Jan 07, 2015 at 07:01:10PM -0800, Mario Smarduch wrote:
 On 01/07/2015 05:05 AM, Christoffer Dall wrote:
 On Sun, Dec 14, 2014 at 11:28:08PM -0800, Mario Smarduch wrote:
 This patch adds the same support for PUD huge page as for PMD. Huge PUD is 
 write protected for initial memory region write protection. Code to 
 dissolve 
 huge PUD is supported in user_mem_abort(). At this time this code has not 
 been 
 tested, but similar approach to current ARMv8 page logging test is in work,
 limiting kernel memory and mapping in 1 or 2GB into Guest address space on 
 a 
 4k page/48 bit host, some host kernel test code needs to be added to detect
 page fault to this region and side step general processing. Also similar 
 to 
 PMD case all pages in range are marked dirty when PUD entry is cleared.

 the note about this code being untested shouldn't be part of the commit
 message but after the '---' separater or in the cover letter I think.

 Ah ok.


 Signed-off-by: Mario Smarduch m.smard...@samsung.com
 ---
  arch/arm/include/asm/kvm_mmu.h |  8 +
  arch/arm/kvm/mmu.c | 64 
 --
  arch/arm64/include/asm/kvm_mmu.h   |  9 +
  arch/arm64/include/asm/pgtable-hwdef.h |  3 ++
  4 files changed, 81 insertions(+), 3 deletions(-)

 diff --git a/arch/arm/include/asm/kvm_mmu.h 
 b/arch/arm/include/asm/kvm_mmu.h
 index dda0046..703d04d 100644
 --- a/arch/arm/include/asm/kvm_mmu.h
 +++ b/arch/arm/include/asm/kvm_mmu.h
 @@ -133,6 +133,14 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
   return (pmd_val(*pmd) & L_PMD_S2_RDWR) == L_PMD_S2_RDONLY;
  }
  
 +static inline void kvm_set_s2pud_readonly(pud_t *pud)
 +{
 +}
 +
 +static inline bool kvm_s2pud_readonly(pud_t *pud)
 +{
 +  return false;
 +}
  
  /* Open coded p*d_addr_end that can deal with 64bit addresses */
  #define kvm_pgd_addr_end(addr, end)   
 \
 diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
 index 59003df..35840fb 100644
 --- a/arch/arm/kvm/mmu.c
 +++ b/arch/arm/kvm/mmu.c
 @@ -109,6 +109,55 @@ void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t 
 addr, pmd_t *pmd)
}
  }
  
 +/**
 +  * stage2_find_pud() - find a PUD entry
 +  * @kvm: pointer to kvm structure.
 +  * @addr:IPA address
 +  *
 +  * Return address of PUD entry or NULL if not allocated.
 +  */
 +static pud_t *stage2_find_pud(struct kvm *kvm, phys_addr_t addr)

 why can't you reuse stage2_get_pud here?

 stage2_get_* allocate intermediate tables, when they're called
 you know intermediate tables are needed to install a pmd or pte.
 But currently there is no way to tell we faulted in a PUD
 region, this code just checks if a PUD is set, and not
 allocate intermediate tables along the way.
 
 hmmm, but if we get here it means that we are faulting on an address, so
 we need to map something at that address regardless, so I don't see the
 problem in using stage2_get_pud.
 

 Overall not sure if this is in preparation for a new huge page (PUD sized)?
 Besides developing a custom test, not sure how to use this
 and determine we fault in a PUD region? Generic 'gup'
 code does handle PUDs but perhaps some arch. has PUD sized
 huge pages.

 
 When Marc and I discussed this we came to the conclusion that we wanted
 code to support this code path for when huge PUDs were suddently used,
 but now when I see the code, I am realizing that adding huge PUD support
 on the Stage-2 level requires a lot of changes to this file, so I really
 don't think we need to handle it at the point after all.
 

 +{
 +  pgd_t *pgd;
 +
 +  pgd = kvm->arch.pgd + pgd_index(addr);
 +  if (pgd_none(*pgd))
 +  return NULL;
 +
 +  return pud_offset(pgd, addr);
 +}
 +
 +/**
 + * stage2_dissolve_pud() - clear and flush huge PUD entry
 + * @kvm:  pointer to kvm structure.
 + * @addr  IPA
 + *
 + * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. 
 Marks all
 + * pages in the range dirty.
 + */
 +void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr)
 +{
 +  pud_t *pud;
 +  gfn_t gfn;
 +  long i;
 +
 +  pud = stage2_find_pud(kvm, addr);
 +  if (pud && !pud_none(*pud) && kvm_pud_huge(*pud)) {

 I'm just thinking here, why do we need to check if we get a valid pud
 back here, but we don't need the equivalent check in dissolve_pmd from
 patch 7?

 kvm_pud_huge() doesn't check bit 0 for invalid entry, but
 pud_none() is not the right way to check either, maybe pud_bad()
 first. Nothing is done in patch 7 since the pmd is retrieved from
 stage2_get_pmd().

 
 hmmm, but stage2_get_pmd() can return a NULL pointer if you have the
 IOMAP flag set...
 

 I think the rationale is that it should never happen because we never
 call these functions with the logging and iomap flags at the same
 time...

 I'm little lost here, not sure how it's related to above.
 But I think a VFIO device will have a memslot and
 it would be possible to enable logging. But 

Re: [PATCH] KVM: x86: allow TSC deadline timer on all hosts

2015-01-08 Thread Paolo Bonzini


On 08/01/2015 15:59, Radim Krčmář wrote:
 Emulation does not utilize the feature.

Indeed... nice :)

Applied to kvm/queue, thanks.

Paolo

 Signed-off-by: Radim Krčmář rkrc...@redhat.com
 ---
  arch/x86/kvm/x86.c | 4 +---
  1 file changed, 1 insertion(+), 3 deletions(-)
 
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index 933a373..ed879f1 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -2743,6 +2743,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long 
 ext)
   case KVM_CAP_READONLY_MEM:
   case KVM_CAP_HYPERV_TIME:
   case KVM_CAP_IOAPIC_POLARITY_IGNORED:
 + case KVM_CAP_TSC_DEADLINE_TIMER:
  #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
   case KVM_CAP_ASSIGN_DEV_IRQ:
   case KVM_CAP_PCI_2_3:
 @@ -2781,9 +2782,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long 
 ext)
   case KVM_CAP_TSC_CONTROL:
   r = kvm_has_tsc_control;
   break;
 - case KVM_CAP_TSC_DEADLINE_TIMER:
 - r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER);
 - break;
   default:
   r = 0;
   break;
 


Re: [RESEND PATCH v15 07/11] KVM: arm: page logging 2nd stage fault handling

2015-01-08 Thread Mario Smarduch
On 01/08/2015 02:45 AM, Christoffer Dall wrote:
 On Wed, Jan 07, 2015 at 05:43:18PM -0800, Mario Smarduch wrote:
 Hi Christoffer,
   before going through your comments, I discovered that
 in 3.18.0-rc2 - a generic __get_user_pages_fast()
 was implemented, now ARM picks this up. This causes
 gfn_to_pfn_prot() to return meaningful 'writable'
 value for a read fault, provided the region is writable.

 Prior to that the weak version returned 0 and 'writable'
 had no optimization effect to set pte/pmd - RW on
 a read fault.

 As a consequence dirty logging broke in 3.18, I was seeing
Correction on this, proper __get_user_pages_fast()
behavior exposed a bug in page logging code.

 weird but very intermittent issues. I just put in the
 additional few lines to fix it, prevent pte RW (only R) on
 read faults  while  logging writable region.

 On 01/07/2015 04:38 AM, Christoffer Dall wrote:
 On Wed, Dec 17, 2014 at 06:07:29PM -0800, Mario Smarduch wrote:
 This patch is a followup to v15 patch series, with following changes:
 - When clearing/dissolving a huge, PMD mark huge page range dirty, since
   the state of whole range is unknown. After the huge page is dissolved 
   dirty page logging is at page granularity.

 What is the sequence of events where you could have dirtied another page
 within the PMD range after the user initially requested dirty page
 logging?

 No there is none. My issue was the start point for tracking dirty pages
 and that would be second call to dirty log read. Not first
 call after initial write protect where any page in range can
 be assumed dirty. I'll remove this, not sure if there would be any
 use case to call dirty log only once.

 
 Calling dirty log once can not give you anything meaningful, right?  You
 must assume all memory is 'dirty' at this point, no?

There is the interval between KVM_MEM_LOG_DIRTY_PAGES and first
call to KVM_GET_DIRTY_LOG. Not sure of any use case, maybe enable
logging, wait a while do a dirty log read, disable logging.
Get an accumulated snapshot of dirty page activity.

- Mario

 
 -Christoffer
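To make that window concrete, a hedged sketch of the userspace sequence being
described (standard KVM ioctls; slot number, addresses and sizes are
placeholders):

    /* 1. Enable dirty logging on the slot. */
    struct kvm_userspace_memory_region region = {
            .slot            = 0,
            .flags           = KVM_MEM_LOG_DIRTY_PAGES,
            .guest_phys_addr = gpa,
            .memory_size     = size,
            .userspace_addr  = (__u64)hva,
    };
    ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);

    /* ... let the guest run for a while ... */

    /* 2. A single read returns everything dirtied since logging was enabled. */
    struct kvm_dirty_log log = { .slot = 0, .dirty_bitmap = bitmap };
    ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);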
 



Re: [PATCH v15 10/11] KVM: arm/arm64: Enable Dirty Page logging for ARMv8

2015-01-08 Thread Mario Smarduch
On 01/08/2015 02:56 AM, Christoffer Dall wrote:
 On Wed, Jan 07, 2015 at 05:51:15PM -0800, Mario Smarduch wrote:
 On 01/07/2015 04:47 AM, Christoffer Dall wrote:
 On Sun, Dec 14, 2014 at 11:28:07PM -0800, Mario Smarduch wrote:
 This patch enables ARMv8 ditry page logging support. Plugs ARMv8 into 
 generic

dirty
 yeah.

 layer through Kconfig symbol, and drops earlier ARM64 constraints to enable
 logging at architecture layer.

 Signed-off-by: Mario Smarduch m.smard...@samsung.com
 ---
  arch/arm/include/asm/kvm_host.h | 12 
  arch/arm/kvm/arm.c  |  4 
  arch/arm/kvm/mmu.c  | 19 +++
  arch/arm64/kvm/Kconfig  |  2 ++
  4 files changed, 13 insertions(+), 24 deletions(-)

 diff --git a/arch/arm/include/asm/kvm_host.h 
 b/arch/arm/include/asm/kvm_host.h
 index b138431..088ea87 100644
 --- a/arch/arm/include/asm/kvm_host.h
 +++ b/arch/arm/include/asm/kvm_host.h
 @@ -223,18 +223,6 @@ static inline void __cpu_init_hyp_mode(phys_addr_t 
 boot_pgd_ptr,
kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr);
  }
  
 -/**
 - * kvm_flush_remote_tlbs() - flush all VM TLB entries
 - * @kvm:  pointer to kvm structure.
 - *
 - * Interface to HYP function to flush all VM TLB entries without address
 - * parameter.
 - */
 -static inline void kvm_flush_remote_tlbs(struct kvm *kvm)
 -{
 -  kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
 -}
 -
  static inline int kvm_arch_dev_ioctl_check_extension(long ext)
  {
return 0;
 diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
 index 6e4290c..1b6577c 100644
 --- a/arch/arm/kvm/arm.c
 +++ b/arch/arm/kvm/arm.c
 @@ -740,7 +740,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
   */
  int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
  {
 -#ifdef CONFIG_ARM
bool is_dirty = false;
int r;
  
 @@ -753,9 +752,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct 
 kvm_dirty_log *log)
  
mutex_unlock(kvm-slots_lock);
return r;
 -#else /* arm64 */
 -  return -EINVAL;
 -#endif
  }
  
  static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
 diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
 index dc763bb..59003df 100644
 --- a/arch/arm/kvm/mmu.c
 +++ b/arch/arm/kvm/mmu.c
 @@ -52,11 +52,18 @@ static phys_addr_t hyp_idmap_vector;
  
  static bool kvm_get_logging_state(struct kvm_memory_slot *memslot)
  {
 -#ifdef CONFIG_ARM
return !!memslot-dirty_bitmap;
 -#else
 -  return false;
 -#endif
 +}
 +
 +/**
 + * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
 + * @kvm:  pointer to kvm structure.
 + *
 + * Interface to HYP function to flush all VM TLB entries
 + */
 +inline void kvm_flush_remote_tlbs(struct kvm *kvm)

 did you intend for a non-staic inline here?

 Yes it's used in arm.c and mmu.c
 
 then why inline?
 
 I'm not a compiler expert by any measure, but poking around I'm pretty
 sure the inline keyword in this context is useless.  See for example:
 http://www.cs.nyu.edu/~xiaojian/bookmark/c_programming/Inline_Functions.htm
 
 So I suggest either make it a normal stand-alone function or keep it as
 duplicate static inlines in the header files if you're adamant about
 this being inlined.

Sorry about that, should have given this a closer look, this
shouldn't require a turnaround from you.

 
 -Christoffer
 



Re: [PATCH 1/4] mm: Correct ordering of *_clear_flush_young_notify

2015-01-08 Thread Andrea Arcangeli
On Thu, Jan 08, 2015 at 11:59:06AM +, Marc Zyngier wrote:
 From: Steve Capper steve.cap...@linaro.org
 
 ptep_clear_flush_young_notify and pmdp_clear_flush_young_notify both
 call the notifiers *after* the pte/pmd has been made young.
 

On x86 with EPT and no hardware access bit (!shadow_accessed_mask),
we'll trigger a KVM page fault (gup_fast), which marks the page
referenced to give it higher priority in the LRU (or sets the accessed
bit if it's a THP).

If we drop the KVM shadow pagetable before clearing the accessed bit
in the linux pte, there's a window where we won't set the young bit
for THP. For non-THP it's less of an issue because gup_fast calls
mark_page_accessed which rolls the lrus and sets the referenced bit in
the struct page, so the effect of mark_page_accessed doesn't get
lost when the linux pte accessed bit is cleared.

We could also consider using mark_page_accessed in
follow_trans_huge_pmd to work around the problem.  I think setting the
young bit in gup_fast is correct and more similar to a real CPU access
(which is what gup_fast simulates anyway), so the patch below literally
introduces a race condition, even if it will be lost in the noise and
isn't a real problem.

 This can cause problems with KVM that relies on being able to block
 MMU notifiers when carrying out maintenance of second stage
 descriptors.
 
 This patch ensures that the MMU notifiers are called before ptes and
 pmds are made old.

Unfortunately I don't understand why this is needed.

The only difference this can make to KVM is that without the patch,
kvm_age_rmapp is called while the linux pte is less likely to have the
accessed bit set (current behavior). It can still be set by hardware
through another CPU touching the memory before the mmu notifier is
invoked.

With the patch the linux pte is more likely to have the accessed bit
set as it's not cleared before calling the mmu notifier.

In both cases (at least on x86, where the accessed bit is always set in
hardware) the accessed bit may or may not be set. The pte cannot
otherwise change, since the notifier is called with the PT lock held.

So again it looks like a noop, and it introduces a mostly theoretical
race condition for the THP young bit in the linux pte with EPT and
!shadow_accessed_mask.
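
To spell out the noop part, here is a minimal userspace model of the two
orderings, with stub helpers that assume nothing about the real kernel paths:
the returned young bit is the OR of the two sources either way, and only the
window between the two side effects moves.

#include <stdio.h>
#include <stdbool.h>

/* Stand-ins for ptep_clear_flush_young() and
 * mmu_notifier_clear_flush_young(): each reports whether its side saw
 * the page young, and has a side effect (making the linux pte old /
 * dropping the secondary-MMU mapping). */
static bool clear_linux_pte_young(bool *linux_young)
{
	bool was_young = *linux_young;
	*linux_young = false;		/* side effect: pte made old */
	return was_young;
}

static bool notifier_clear_young(bool *spte_present)
{
	bool was_mapped = *spte_present;
	*spte_present = false;		/* side effect: shadow mapping dropped */
	return was_mapped;
}

int main(void)
{
	/* Old order: linux pte first, then the notifier. */
	bool pte_a = true, spte_a = true;
	int young_a = clear_linux_pte_young(&pte_a);
	young_a |= notifier_clear_young(&spte_a);

	/* New order (the patch): notifier first, then the linux pte. */
	bool pte_b = true, spte_b = true;
	int young_b = notifier_clear_young(&spte_b);
	young_b |= clear_linux_pte_young(&pte_b);

	/* Same ORed result either way; only the window between the two
	 * side effects moves, which is the THP/EPT race discussed above. */
	printf("old order: %d, new order: %d\n", young_a, young_b);
	return 0;
}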

Clearly there must be some obscure ARM detail I'm not aware of that
makes this helpful, but the description in the commit header isn't
enough to understand what's up with blocking MMU notifiers. Could you
elaborate?

Thanks!
Andrea

 
 Signed-off-by: Steve Capper steve.cap...@linaro.org
 ---
  include/linux/mmu_notifier.h | 8 
  1 file changed, 4 insertions(+), 4 deletions(-)
 
 diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
 index 95243d2..c454c76 100644
 --- a/include/linux/mmu_notifier.h
 +++ b/include/linux/mmu_notifier.h
 @@ -290,11 +290,11 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
   int __young;\
   struct vm_area_struct *___vma = __vma;  \
   unsigned long ___address = __address;   \
 - __young = ptep_clear_flush_young(___vma, ___address, __ptep);   \
 - __young |= mmu_notifier_clear_flush_young(___vma->vm_mm,\
 + __young = mmu_notifier_clear_flush_young(___vma->vm_mm, \
 ___address,   \
 ___address +  \
   PAGE_SIZE); \
 + __young |= ptep_clear_flush_young(___vma, ___address, __ptep);  \
   __young;\
  })
  
 @@ -303,11 +303,11 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
   int __young;\
   struct vm_area_struct *___vma = __vma;  \
   unsigned long ___address = __address;   \
 - __young = pmdp_clear_flush_young(___vma, ___address, __pmdp);   \
 - __young |= mmu_notifier_clear_flush_young(___vma->vm_mm,\
 + __young = mmu_notifier_clear_flush_young(___vma->vm_mm, \
 ___address,   \
 ___address +  \
   PMD_SIZE);  \
 + __young |= pmdp_clear_flush_young(___vma, ___address, __pmdp);  \
   __young;\
  })
  
 -- 
 2.1.4
 