[PATCH v3 0/7] KVM: MMU: fix release pfn in mmu code
Changelog: changes from Avi's comments: - comment for FNAME(fetch) - add annotations (__acquires, __releases) for page_fault_start and page_fault_end changes from Marcelo's comments: - remove mmu_is_invalid - make release noslot pfn path more readable The last patch which introduces page_fault_start and page_fault_end is controversial; I hope we can try it since it wraps the ugly pfn release path up, but I respect your idea. :) Releasing a pfn in the mmu code is a little special because we allow a no-slot pfn to go through the spte walk on the page fault path; that means, on the page fault failure path, we cannot directly call kvm_release_pfn_clean. This patchset fixes the bug which releases a no-slot pfn on the failure path and cleans up all the paths where kvm_release_pfn_clean is called -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 1/7] KVM: MMU: fix release noslot pfn
We can not directly call kvm_release_pfn_clean to release the pfn since we can meet noslot pfn which is used to cache mmio info into spte Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/mmu.c |6 -- arch/x86/kvm/paging_tmpl.h |6 -- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index aa0b469..0f56169 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2877,7 +2877,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, out_unlock: spin_unlock(vcpu-kvm-mmu_lock); - kvm_release_pfn_clean(pfn); + if (likely(!is_noslot_pfn(pfn))) + kvm_release_pfn_clean(pfn); return 0; } @@ -3345,7 +3346,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, out_unlock: spin_unlock(vcpu-kvm-mmu_lock); - kvm_release_pfn_clean(pfn); + if (likely(!is_noslot_pfn(pfn))) + kvm_release_pfn_clean(pfn); return 0; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index bf8c42b..9ce6bc0 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -544,7 +544,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, out_gpte_changed: if (sp) kvm_mmu_put_page(sp, it.sptep); - kvm_release_pfn_clean(pfn); + if (likely(!is_noslot_pfn(pfn))) + kvm_release_pfn_clean(pfn); return NULL; } @@ -645,7 +646,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, out_unlock: spin_unlock(vcpu-kvm-mmu_lock); - kvm_release_pfn_clean(pfn); + if (likely(!is_noslot_pfn(pfn))) + kvm_release_pfn_clean(pfn); return 0; } -- 1.7.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 2/7] KVM: MMU: remove mmu_is_invalid
Remove mmu_is_invalid and use is_invalid_pfn instead Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/mmu.c |5 - arch/x86/kvm/paging_tmpl.h |4 ++-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0f56169..3e9728b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2700,11 +2700,6 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, } } -static bool mmu_invalid_pfn(pfn_t pfn) -{ - return unlikely(is_invalid_pfn(pfn)); -} - static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, pfn_t pfn, unsigned access, int *ret_val) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 9ce6bc0..b400761 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -370,7 +370,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, pgprintk(%s: gpte %llx spte %p\n, __func__, (u64)gpte, spte); pte_access = sp-role.access FNAME(gpte_access)(vcpu, gpte, true); pfn = gfn_to_pfn_atomic(vcpu-kvm, gpte_to_gfn(gpte)); - if (mmu_invalid_pfn(pfn)) + if (is_invalid_pfn(pfn)) return; /* @@ -446,7 +446,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, gfn = gpte_to_gfn(gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, pte_access ACC_WRITE_MASK); - if (mmu_invalid_pfn(pfn)) + if (is_invalid_pfn(pfn)) break; mmu_set_spte(vcpu, spte, sp-role.access, pte_access, 0, 0, -- 1.7.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 3/7] KVM: MMU: do not release pfn in mmu_set_spte
It helps us to cleanup release pfn in the later patches Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/mmu.c | 29 ++--- arch/x86/kvm/paging_tmpl.h | 18 -- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3e9728b..bc1cda4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2496,9 +2496,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, rmap_recycle(vcpu, sptep, gfn); } } - - if (!is_error_pfn(pfn)) - kvm_release_pfn_clean(pfn); } static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) @@ -2535,12 +2532,15 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, if (ret = 0) return -1; - for (i = 0; i ret; i++, gfn++, start++) + for (i = 0; i ret; i++, gfn++, start++) { mmu_set_spte(vcpu, start, ACC_ALL, access, 0, 0, NULL, sp-role.level, gfn, page_to_pfn(pages[i]), true, true); + kvm_release_page_clean(pages[i]); + } + return 0; } @@ -2858,23 +2858,22 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, return r; spin_lock(vcpu-kvm-mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) + if (mmu_notifier_retry(vcpu, mmu_seq)) { + r = 0; goto out_unlock; + } + kvm_mmu_free_some_pages(vcpu); if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, gfn, pfn, level); r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, prefault); - spin_unlock(vcpu-kvm-mmu_lock); - - - return r; out_unlock: spin_unlock(vcpu-kvm-mmu_lock); if (likely(!is_noslot_pfn(pfn))) kvm_release_pfn_clean(pfn); - return 0; + return r; } @@ -3328,22 +3327,22 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, return r; spin_lock(vcpu-kvm-mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) + if (mmu_notifier_retry(vcpu, mmu_seq)) { + r = 0; goto out_unlock; + } + kvm_mmu_free_some_pages(vcpu); if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, gfn, pfn, level); r = __direct_map(vcpu, gpa, write, map_writable, 
level, gfn, pfn, prefault); - spin_unlock(vcpu-kvm-mmu_lock); - - return r; out_unlock: spin_unlock(vcpu-kvm-mmu_lock); if (likely(!is_noslot_pfn(pfn))) kvm_release_pfn_clean(pfn); - return 0; + return r; } static void nonpaging_free(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index b400761..56f8085 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -380,6 +380,9 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, mmu_set_spte(vcpu, spte, sp-role.access, pte_access, 0, 0, NULL, PT_PAGE_TABLE_LEVEL, gpte_to_gfn(gpte), pfn, true, true); + + if (likely(!is_noslot_pfn(pfn))) + kvm_release_pfn_clean(pfn); } static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, @@ -452,6 +455,9 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, mmu_set_spte(vcpu, spte, sp-role.access, pte_access, 0, 0, NULL, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true); + + if (likely(!is_noslot_pfn(pfn))) + kvm_release_pfn_clean(pfn); } } @@ -544,8 +550,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, out_gpte_changed: if (sp) kvm_mmu_put_page(sp, it.sptep); - if (likely(!is_noslot_pfn(pfn))) - kvm_release_pfn_clean(pfn); + return NULL; } @@ -625,8 +630,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, return r; spin_lock(vcpu-kvm-mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) + if (mmu_notifier_retry(vcpu, mmu_seq)) { + r = 0; goto out_unlock; + } kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); kvm_mmu_free_some_pages(vcpu); @@ -640,15 +647,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, ++vcpu-stat.pf_fixed; kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); - spin_unlock(vcpu-kvm-mmu_lock); - return emulate; + r = emulate; out_unlock: spin_unlock(vcpu-kvm-mmu_lock); if
[PATCH v3 4/7] KVM: MMU: cleanup FNAME(page_fault)
Let it return emulate state instead of spte like __direct_map Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/paging_tmpl.h | 31 --- 1 files changed, 12 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 56f8085..6fd1c8b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -463,21 +463,21 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, /* * Fetch a shadow pte for a specific level in the paging hierarchy. + * If the guest tries to write a write-protected page, we need to + * emulate this operation, return 1 to indicate this case. */ -static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, +static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *gw, int user_fault, int write_fault, int hlevel, -int *emulate, pfn_t pfn, bool map_writable, -bool prefault) +pfn_t pfn, bool map_writable, bool prefault) { - unsigned access = gw-pt_access; struct kvm_mmu_page *sp = NULL; - int top_level; - unsigned direct_access; struct kvm_shadow_walk_iterator it; + unsigned direct_access, access = gw-pt_access; + int top_level, emulate = 0; if (!is_present_gpte(gw-ptes[gw-level - 1])) - return NULL; + return 0; direct_access = gw-pte_access; @@ -541,17 +541,17 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, clear_sp_write_flooding_count(it.sptep); mmu_set_spte(vcpu, it.sptep, access, gw-pte_access, -user_fault, write_fault, emulate, it.level, +user_fault, write_fault, emulate, it.level, gw-gfn, pfn, prefault, map_writable); FNAME(pte_prefetch)(vcpu, gw, it.sptep); - return it.sptep; + return emulate; out_gpte_changed: if (sp) kvm_mmu_put_page(sp, it.sptep); - return NULL; + return 0; } /* @@ -574,8 +574,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int write_fault = error_code PFERR_WRITE_MASK; int user_fault = error_code PFERR_USER_MASK; struct guest_walker 
walker; - u64 *sptep; - int emulate = 0; int r; pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; @@ -639,17 +637,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, kvm_mmu_free_some_pages(vcpu); if (!force_pt_level) transparent_hugepage_adjust(vcpu, walker.gfn, pfn, level); - sptep = FNAME(fetch)(vcpu, addr, walker, user_fault, write_fault, -level, emulate, pfn, map_writable, prefault); - (void)sptep; - pgprintk(%s: shadow pte %p %llx emulate %d\n, __func__, -sptep, *sptep, emulate); + r = FNAME(fetch)(vcpu, addr, walker, user_fault, write_fault, +level, pfn, map_writable, prefault); ++vcpu-stat.pf_fixed; kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); - r = emulate; - out_unlock: spin_unlock(vcpu-kvm-mmu_lock); if (likely(!is_noslot_pfn(pfn))) -- 1.7.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 5/7] KVM: MMU: introduce FNAME(prefetch_gpte)
The only difference between FNAME(update_pte) and FNAME(pte_prefetch) is that the former is allowed to prefetch gfn from dirty logged slot, so introduce a common function to prefetch spte Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/paging_tmpl.h | 58 ++- 1 files changed, 24 insertions(+), 34 deletions(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6fd1c8b..5b1af72 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -356,33 +356,45 @@ no_present: return true; } -static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, const void *pte) +static bool +FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, +u64 *spte, pt_element_t gpte, bool no_dirty_log) { - pt_element_t gpte; unsigned pte_access; + gfn_t gfn; pfn_t pfn; - gpte = *(const pt_element_t *)pte; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) - return; + return false; pgprintk(%s: gpte %llx spte %p\n, __func__, (u64)gpte, spte); + + gfn = gpte_to_gfn(gpte); pte_access = sp-role.access FNAME(gpte_access)(vcpu, gpte, true); - pfn = gfn_to_pfn_atomic(vcpu-kvm, gpte_to_gfn(gpte)); + pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, + no_dirty_log (pte_access ACC_WRITE_MASK)); if (is_invalid_pfn(pfn)) - return; + return false; /* -* we call mmu_set_spte() with host_writable = true because that -* vcpu-arch.update_pte.pfn was fetched from get_user_pages(write = 1). +* we call mmu_set_spte() with host_writable = true because +* pte_prefetch_gfn_to_pfn always gets a writable pfn. 
*/ mmu_set_spte(vcpu, spte, sp-role.access, pte_access, 0, 0, -NULL, PT_PAGE_TABLE_LEVEL, -gpte_to_gfn(gpte), pfn, true, true); +NULL, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true); if (likely(!is_noslot_pfn(pfn))) kvm_release_pfn_clean(pfn); + + return true; +} + +static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + u64 *spte, const void *pte) +{ + pt_element_t gpte = *(const pt_element_t *)pte; + + FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false); } static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, @@ -428,36 +440,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, spte = sp-spt + i; for (i = 0; i PTE_PREFETCH_NUM; i++, spte++) { - pt_element_t gpte; - unsigned pte_access; - gfn_t gfn; - pfn_t pfn; - if (spte == sptep) continue; if (is_shadow_present_pte(*spte)) continue; - gpte = gptep[i]; - - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) - continue; - - pte_access = sp-role.access FNAME(gpte_access)(vcpu, gpte, - true); - gfn = gpte_to_gfn(gpte); - pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, - pte_access ACC_WRITE_MASK); - if (is_invalid_pfn(pfn)) + if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true)) break; - - mmu_set_spte(vcpu, spte, sp-role.access, pte_access, 0, 0, -NULL, PT_PAGE_TABLE_LEVEL, gfn, -pfn, true, true); - - if (likely(!is_noslot_pfn(pfn))) - kvm_release_pfn_clean(pfn); } } -- 1.7.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 6/7] KVM: MMU: move prefetch_invalid_gpte out of paging_tmpl.h
The function does not depend on guest mmu mode, move it out from paging_tmpl.h Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/mmu.c | 36 arch/x86/kvm/paging_tmpl.h | 24 ++-- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index bc1cda4..a455c0d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2503,6 +2503,14 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) mmu_free_roots(vcpu); } +static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) +{ + int bit7; + + bit7 = (gpte 7) 1; + return (gpte mmu-rsvd_bits_mask[bit7][level-1]) != 0; +} + static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { @@ -2515,6 +2523,26 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, return gfn_to_pfn_memslot_atomic(slot, gfn); } +static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *spte, + u64 gpte) +{ + if (is_rsvd_bits_set(vcpu-arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + goto no_present; + + if (!is_present_gpte(gpte)) + goto no_present; + + if (!(gpte PT_ACCESSED_MASK)) + goto no_present; + + return false; + +no_present: + drop_spte(vcpu-kvm, spte); + return true; +} + static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *start, u64 *end) @@ -3396,14 +3424,6 @@ static void paging_free(struct kvm_vcpu *vcpu) nonpaging_free(vcpu); } -static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) -{ - int bit7; - - bit7 = (gpte 7) 1; - return (gpte mmu-rsvd_bits_mask[bit7][level-1]) != 0; -} - static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, int *nr_present) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 5b1af72..298e5c2 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -336,26 +336,6 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker, addr, access); 
} -static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *spte, - pt_element_t gpte) -{ - if (is_rsvd_bits_set(vcpu-arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) - goto no_present; - - if (!is_present_gpte(gpte)) - goto no_present; - - if (!(gpte PT_ACCESSED_MASK)) - goto no_present; - - return false; - -no_present: - drop_spte(vcpu-kvm, spte); - return true; -} - static bool FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, pt_element_t gpte, bool no_dirty_log) @@ -364,7 +344,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, gfn_t gfn; pfn_t pfn; - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) + if (prefetch_invalid_gpte(vcpu, sp, spte, gpte)) return false; pgprintk(%s: gpte %llx spte %p\n, __func__, (u64)gpte, spte); @@ -778,7 +758,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) sizeof(pt_element_t))) return -EINVAL; - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, sp-spt[i], gpte)) { + if (prefetch_invalid_gpte(vcpu, sp, sp-spt[i], gpte)) { vcpu-kvm-tlbs_dirty++; continue; } -- 1.7.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 7/7] KVM: MMU: introduce page_fault_start/page_fault_end
Wrap the common operations into these two functions Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/mmu.c | 55 arch/x86/kvm/paging_tmpl.h | 12 - 2 files changed, 40 insertions(+), 27 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a455c0d..d75bf9a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2848,6 +2848,31 @@ exit: static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gva_t gva, pfn_t *pfn, bool write, bool *writable); +static bool +page_fault_start(struct kvm_vcpu *vcpu, gfn_t *gfnp, pfn_t *pfnp, int *levelp, +bool force_pt_level, unsigned long mmu_seq) + __acquires(vcpu-kvm-mmu_lock) +{ + spin_lock(vcpu-kvm-mmu_lock); + if (mmu_notifier_retry(vcpu, mmu_seq)) + return false; + + kvm_mmu_free_some_pages(vcpu); + if (likely(!force_pt_level)) + transparent_hugepage_adjust(vcpu, gfnp, pfnp, levelp); + + return true; +} + +static void page_fault_end(struct kvm_vcpu *vcpu, pfn_t pfn) + __releases(vcpu-kvm-mmu_lock) +{ + spin_unlock(vcpu-kvm-mmu_lock); + + if (likely(!is_noslot_pfn(pfn))) + kvm_release_pfn_clean(pfn); +} + static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, gfn_t gfn, bool prefault) { @@ -2885,22 +2910,17 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, r)) return r; - spin_lock(vcpu-kvm-mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) { + if (!page_fault_start(vcpu, gfn, pfn, level, force_pt_level, + mmu_seq)) { r = 0; - goto out_unlock; + goto exit; } - kvm_mmu_free_some_pages(vcpu); - if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, gfn, pfn, level); r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, prefault); -out_unlock: - spin_unlock(vcpu-kvm-mmu_lock); - if (likely(!is_noslot_pfn(pfn))) - kvm_release_pfn_clean(pfn); +exit: + page_fault_end(vcpu, pfn); return r; } @@ -3354,22 +3374,17 @@ static int tdp_page_fault(struct 
kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, r)) return r; - spin_lock(vcpu-kvm-mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) { + if (!page_fault_start(vcpu, gfn, pfn, level, force_pt_level, + mmu_seq)) { r = 0; - goto out_unlock; + goto exit; } - kvm_mmu_free_some_pages(vcpu); - if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, gfn, pfn, level); r = __direct_map(vcpu, gpa, write, map_writable, level, gfn, pfn, prefault); -out_unlock: - spin_unlock(vcpu-kvm-mmu_lock); - if (likely(!is_noslot_pfn(pfn))) - kvm_release_pfn_clean(pfn); +exit: + page_fault_end(vcpu, pfn); return r; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 298e5c2..269116d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -597,10 +597,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, walker.gfn, pfn, walker.pte_access, r)) return r; - spin_lock(vcpu-kvm-mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) { + if (!page_fault_start(vcpu, walker.gfn, pfn, level, + force_pt_level, mmu_seq)) { r = 0; - goto out_unlock; + goto exit; } kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); @@ -613,10 +613,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, ++vcpu-stat.pf_fixed; kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); -out_unlock: - spin_unlock(vcpu-kvm-mmu_lock); - if (likely(!is_noslot_pfn(pfn))) - kvm_release_pfn_clean(pfn); +exit: + page_fault_end(vcpu, pfn); return r; } -- 1.7.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 02/17] target-i386: Add missing kvm bits.
On Thu, 20 Sep 2012 16:03:17 -0400 Don Slutz d...@cloudswitch.com wrote: Fix duplicate name (kvmclock = kvm_clock2) also. Signed-off-by: Don Slutz d...@cloudswitch.com --- target-i386/cpu.c | 12 1 files changed, 8 insertions(+), 4 deletions(-) diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 0313cf5..5f9866a 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -87,10 +87,14 @@ static const char *ext3_feature_name[] = { }; static const char *kvm_feature_name[] = { -kvmclock, kvm_nopiodelay, kvm_mmu, kvmclock, kvm_asyncpf, NULL, kvm_pv_eoi, NULL, -NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, -NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, -NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +kvmclock, kvm_nopiodelay, kvm_mmu, kvm_clock2, before patch if kvmclock is specified it would set 0 and 3 bits, after patch only bit 0 is set. Is it correct/expected behavior? if yes, please add rationale into patch description. +kvm_asyncpf, kvm_steal_time, kvm_pv_eoi, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +kvm_clock_stable, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, }; static const char *svm_feature_name[] = { -- 1.7.1 -- Regards, Igor -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH v3] kvm/fpu: Enable fully eager restore kvm FPU
-Original Message- From: Avi Kivity [mailto:a...@redhat.com] Sent: Thursday, September 20, 2012 5:20 PM To: Hao, Xudong Cc: Marcelo Tosatti; kvm@vger.kernel.org; Zhang, Xiantao Subject: Re: [PATCH v3] kvm/fpu: Enable fully eager restore kvm FPU On guest entry: if (!lazy_fpu_allowed(vcpu)) kvm_x86_ops-fpu_activate(vcpu); But we already have that: if (vcpu-fpu_active) kvm_load_guest_fpu(vcpu); so why not manage fpu_active to be always set when needed? I don't want more checks in the entry path. I means add fpu_active() in kvm_set_xcr(), not in guest entry. Then the fpu_active will be set always when guest initialize xstate. @@ -574,6 +574,9 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) kvm_inject_gp(vcpu, 0); return 1; } + if (!lazy_fpu_allowed(vcpu)) + kvm_x86_ops-fpu_activate(vcpu); return 0; And of course on cr4 update. So a function update_lazy_fpu() to be called from both places is needed. Complete consideration, thanks. So I will define a function update_lazy_fpu(), insert it into kvm_set_xcr() and handle_cr(). Comments? -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/6] Reduce compaction scanning and lock contention
Hi Mel, Thank you for this series. I have applied on clean 3.6-rc5 and tested, and it works well for me - the lock contention is (still) gone and isolate_freepages_block is much reduced. Here is a typical test with these patches: # grep -F '[k]' report | head -8 65.20% qemu-kvm [kernel.kallsyms] [k] clear_page_c 2.18% qemu-kvm [kernel.kallsyms] [k] isolate_freepages_block 1.56% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock 1.40% qemu-kvm [kernel.kallsyms] [k] svm_vcpu_run 1.38% swapper [kernel.kallsyms] [k] default_idle 1.35% qemu-kvm [kernel.kallsyms] [k] get_page_from_freelist 0.74% ksmd [kernel.kallsyms] [k] memcmp 0.72% qemu-kvm [kernel.kallsyms] [k] free_pages_prepare I did manage to get a couple which were slightly worse, but nothing like as bad as before. Here are the results: # grep -F '[k]' report | head -8 45.60% qemu-kvm [kernel.kallsyms] [k] clear_page_c 11.26% qemu-kvm [kernel.kallsyms] [k] isolate_freepages_block 3.21% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock 2.27% ksmd [kernel.kallsyms] [k] memcmp 2.02%swapper [kernel.kallsyms] [k] default_idle 1.58% qemu-kvm [kernel.kallsyms] [k] svm_vcpu_run 1.30% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock_irqsave 1.09% qemu-kvm [kernel.kallsyms] [k] get_page_from_freelist # grep -F '[k]' report | head -8 61.29% qemu-kvm [kernel.kallsyms] [k] clear_page_c 4.52% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock_irqsave 2.64% qemu-kvm [kernel.kallsyms] [k] copy_page_c 1.61%swapper [kernel.kallsyms] [k] default_idle 1.57% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock 1.18% qemu-kvm [kernel.kallsyms] [k] get_page_from_freelist 1.18% qemu-kvm [kernel.kallsyms] [k] isolate_freepages_block 1.11% qemu-kvm [kernel.kallsyms] [k] svm_vcpu_run I will follow up with the detailed traces for these three tests. Thank you! Richard. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 1/2] KVM: PPC: Book3S: Get/set guest SPRs using the GET/SET_ONE_REG interface
On 21.09.2012, at 07:44, Paul Mackerras wrote: This enables userspace to get and set various SPRs (special-purpose registers) using the KVM_[GS]ET_ONE_REG ioctls. With this, userspace can get and set all the SPRs that are part of the guest state, either through the KVM_[GS]ET_REGS ioctls, the KVM_[GS]ET_SREGS ioctls, or the KVM_[GS]ET_ONE_REG ioctls. The SPRs that are added here are: - DABR: Data address breakpoint register - DSCR: Data stream control register - PURR: Processor utilization of resources register - SPURR: Scaled PURR - DAR: Data address register - DSISR: Data storage interrupt status register - AMR: Authority mask register - UAMOR: User authority mask override register - MMCR0, MMCR1, MMCRA: Performance monitor unit control registers - PMC1..PMC8: Performance monitor unit counter registers In order to reduce code duplication between PR and HV KVM code, this moves the kvm_vcpu_ioctl_[gs]et_one_reg functions into book3s.c and centralizes the copying between user and kernel space there. The registers that are handled differently between PR and HV, and those that exist only in one flavor, are handled in kvmppc_[gs]et_one_reg() functions that are specific to each flavor. 
Signed-off-by: Paul Mackerras pau...@samba.org --- v3: handle DAR and DSISR, plus copy to/from userspace, in common code Documentation/virtual/kvm/api.txt | 19 + arch/powerpc/include/asm/kvm.h | 21 ++ arch/powerpc/include/asm/kvm_ppc.h |7 arch/powerpc/kvm/book3s.c | 74 +++ arch/powerpc/kvm/book3s_hv.c | 76 ++-- arch/powerpc/kvm/book3s_pr.c | 23 ++- 6 files changed, 196 insertions(+), 24 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 76a07a6..e4a2067 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1740,6 +1740,25 @@ registers, find a list below: PPC | KVM_REG_PPC_IAC4 | 64 PPC | KVM_REG_PPC_DAC1 | 64 PPC | KVM_REG_PPC_DAC2 | 64 + PPC | KVM_REG_PPC_DABR | 64 + PPC | KVM_REG_PPC_DSCR | 64 + PPC | KVM_REG_PPC_PURR | 64 + PPC | KVM_REG_PPC_SPURR | 64 + PPC | KVM_REG_PPC_DAR | 64 + PPC | KVM_REG_PPC_DSISR | 32 + PPC | KVM_REG_PPC_AMR | 64 + PPC | KVM_REG_PPC_UAMOR | 64 + PPC | KVM_REG_PPC_MMCR0 | 64 + PPC | KVM_REG_PPC_MMCR1 | 64 + PPC | KVM_REG_PPC_MMCRA | 64 + PPC | KVM_REG_PPC_PMC1 | 32 + PPC | KVM_REG_PPC_PMC2 | 32 + PPC | KVM_REG_PPC_PMC3 | 32 + PPC | KVM_REG_PPC_PMC4 | 32 + PPC | KVM_REG_PPC_PMC5 | 32 + PPC | KVM_REG_PPC_PMC6 | 32 + PPC | KVM_REG_PPC_PMC7 | 32 + PPC | KVM_REG_PPC_PMC8 | 32 4.69 KVM_GET_ONE_REG diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index 3c14202..9557576 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -338,5 +338,26 @@ struct kvm_book3e_206_tlb_params { #define KVM_REG_PPC_IAC4 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x5) #define KVM_REG_PPC_DAC1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x6) #define KVM_REG_PPC_DAC2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x7) +#define KVM_REG_PPC_DABR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8) +#define KVM_REG_PPC_DSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x9) +#define KVM_REG_PPC_PURR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa) +#define KVM_REG_PPC_SPURR(KVM_REG_PPC | 
KVM_REG_SIZE_U64 | 0xb) +#define KVM_REG_PPC_DAR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc) +#define KVM_REG_PPC_DSISR(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xd) +#define KVM_REG_PPC_AMR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xe) +#define KVM_REG_PPC_UAMOR(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xf) + +#define KVM_REG_PPC_MMCR0(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x10) +#define KVM_REG_PPC_MMCR1(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x11) +#define KVM_REG_PPC_MMCRA(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x12) + +#define KVM_REG_PPC_PMC1 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x18) +#define KVM_REG_PPC_PMC2 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x19) +#define KVM_REG_PPC_PMC3 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1a) +#define KVM_REG_PPC_PMC4 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1b) +#define KVM_REG_PPC_PMC5 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1c) +#define KVM_REG_PPC_PMC6 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1d) +#define KVM_REG_PPC_PMC7 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1e) +#define KVM_REG_PPC_PMC8 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1f) #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2c94cb3..6002b0a 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -196,6 +196,11
[Autotest][PATCH 1/5] Autotest: Add utils for OpenVSwitch patch
ForAllxx: run object method on every object in list ForAll[a,b,c].print() Signed-off-by: Jiří Župka jzu...@redhat.com --- client/shared/base_utils.py | 81 +++--- 1 files changed, 67 insertions(+), 14 deletions(-) diff --git a/client/shared/base_utils.py b/client/shared/base_utils.py index 0734742..573b907 100644 --- a/client/shared/base_utils.py +++ b/client/shared/base_utils.py @@ -1224,6 +1224,56 @@ def system_output_parallel(commands, timeout=None, ignore_status=False, return out +class ForAll(list): +def __getattr__(self, name): +def wrapper(*args, **kargs): +return map(lambda o: o.__getattribute__(name)(*args, **kargs), self) +return wrapper + + +class ForAllP(list): + +Parallel version of ForAll + +def __getattr__(self, name): +def wrapper(*args, **kargs): +threads = [] +for o in self: +threads.append(InterruptedThread(o.__getattribute__(name), +args=args, kwargs=kargs)) +for t in threads: +t.start() +return map(lambda t: t.join(), threads) +return wrapper + + +class ForAllPSE(list): + +Parallel version of and suppress exception. 
+ +def __getattr__(self, name): +def wrapper(*args, **kargs): +threads = [] +for o in self: +threads.append(InterruptedThread(o.__getattribute__(name), +args=args, kwargs=kargs)) +for t in threads: +t.start() + +result = [] +for t in threads: +ret = {} +try: +ret[return] = t.join() +except Exception: +ret[exception] = sys.exc_info() +ret[args] = args +ret[kargs] = kargs +result.append(ret) +return result +return wrapper + + def etraceback(prep, exc_info): Enhanced Traceback formats traceback into lines prep: line\nname: line @@ -1733,9 +1783,12 @@ def import_site_function(path, module, funcname, dummy, modulefile=None): return import_site_symbol(path, module, funcname, dummy, modulefile) -def _get_pid_path(program_name): -pid_files_dir = GLOBAL_CONFIG.get_config_value(SERVER, 'pid_files_dir', - default=) +def get_pid_path(program_name, pid_files_dir=None): +if pid_files_dir is None: +pid_files_dir = GLOBAL_CONFIG.get_config_value(SERVER, + 'pid_files_dir', + default=) + if not pid_files_dir: base_dir = os.path.dirname(__file__) pid_path = os.path.abspath(os.path.join(base_dir, .., .., @@ -1746,25 +1799,25 @@ def _get_pid_path(program_name): return pid_path -def write_pid(program_name): +def write_pid(program_name, pid_files_dir=None): Try to drop program_name.pid in the main autotest directory. Args: program_name: prefix for file name -pidfile = open(_get_pid_path(program_name), w) +pidfile = open(get_pid_path(program_name, pid_files_dir), w) try: pidfile.write(%s\n % os.getpid()) finally: pidfile.close() -def delete_pid_file_if_exists(program_name): +def delete_pid_file_if_exists(program_name, pid_files_dir=None): Tries to remove program_name.pid from the main autotest directory. 
-pidfile_path = _get_pid_path(program_name) +pidfile_path = get_pid_path(program_name, pid_files_dir) try: os.remove(pidfile_path) @@ -1774,18 +1827,18 @@ def delete_pid_file_if_exists(program_name): raise -def get_pid_from_file(program_name): +def get_pid_from_file(program_name, pid_files_dir=None): Reads the pid from program_name.pid in the autotest directory. @param program_name the name of the program @return the pid if the file exists, None otherwise. -pidfile_path = _get_pid_path(program_name) +pidfile_path = get_pid_path(program_name, pid_files_dir) if not os.path.exists(pidfile_path): return None -pidfile = open(_get_pid_path(program_name), 'r') +pidfile = open(get_pid_path(program_name, pid_files_dir), 'r') try: try: @@ -1808,27 +1861,27 @@ def get_process_name(pid): return get_field(read_file(/proc/%d/stat % pid), 1)[1:-1] -def program_is_alive(program_name): +def program_is_alive(program_name, pid_files_dir=None): Checks if the process is alive and not in Zombie state. @param program_name the name of the program @return True if still alive, False otherwise -pid = get_pid_from_file(program_name) +pid = get_pid_from_file(program_name, pid_files_dir) if pid is None:
[Autotest][PATCH 2/5] virt: Adds OpenVSwitch support to virt tests.
When autotest tries add tap to bridge then test recognize if test is bridge is standard linux or OpenVSwitch. And adds some utils for bridge manipulation. Signed-off-by: Jiří Župka jzu...@redhat.com --- client/shared/openvswitch.py | 583 ++ client/tests/virt/virttest/utils_misc.py | 473 +++- 2 files changed, 1042 insertions(+), 14 deletions(-) create mode 100644 client/shared/openvswitch.py diff --git a/client/shared/openvswitch.py b/client/shared/openvswitch.py new file mode 100644 index 000..98dc5e9 --- /dev/null +++ b/client/shared/openvswitch.py @@ -0,0 +1,583 @@ +import logging, re, os, select, signal +try: +import autotest.common as common +except ImportError: +import common +from autotest.client import utils, package +from autotest.client.shared import error +from autotest.client.shared.base_utils import VersionableClass + + +class ServiceManagerInterface(VersionableClass): +def __new__(cls, *args, **kargs): +ServiceManagerInterface.master_class = ServiceManagerInterface +return super(ServiceManagerInterface, cls).__new__(cls, *args, **kargs) + +@classmethod +def get_version(cls): + +Get version of ServiceManager. +@return: Version of ServiceManager. 
+ +return open(/proc/1/comm, r).read().strip() + +def stop(self, service_name): +raise NotImplementedError(Method 'stop' must be + implemented in child class) + + +def start(self, service_name): +raise NotImplementedError(Method 'start' must be + implemented in child class) + + +def restart(self, service_name): +raise NotImplementedError(Method 'restart' must be + implemented in child class) + + +def status(self, service_name): +raise NotImplementedError(Method 'status' must be + implemented in child class) + + +class ServiceManagerSystemD(ServiceManagerInterface, VersionableClass): +@classmethod +def is_right_version(cls, version): +if version == systemd: +return True +return False + +def stop(self, service_name): +utils.run(systemctl stop %s.service % (service_name)) + + +def start(self, service_name): +utils.run(systemctl start %s.service % (service_name)) + + +def restart(self, service_name): +utils.run(systemctl restart %s.service % (service_name)) + + +def status(self, service_name): +utils.run(systemctl show %s.service % (service_name)) + + +class ServiceManagerSysvinit(ServiceManagerInterface, VersionableClass): +@classmethod +def is_right_version(cls, version): +if version == init: +return True +return False + + +def stop(self, service_name): +utils.run(/etc/init.d/%s stop % (service_name)) + + +def start(self, service_name): +utils.run(/etc/init.d/%s start % (service_name)) + + +def restart(self, service_name): +utils.run(/etc/init.d/%s restart % (service_name)) + + +class ServiceManager(ServiceManagerInterface): +pass + + +class PathComplete(object): +def __init__(self, bindir): +self.bindir = bindir + + +def __call__(self, path): +return os.path.join(self.bindir, path) + + +class OpenVSwitchControl(object): + +Class select the best matches control class for installed version +of OpenVSwitch. 
+ +OpenVSwtich parameters are described in man ovs-vswitchd.conf.db + +def __new__(cls, db_path=None, db_socket=None, db_pidfile=None, + ovs_pidfile=None, dbschema=None, install_prefix=None): + +Makes initialization of OpenVSwitch. + +@param tmpdir: Tmp directory for save openvswitch test files. +@param db_path: Path of OVS database. +@param db_socket: Path of OVS db socket. +@param db_pidfile: Path of OVS db ovsdb-server pid. +@param ovs_pidfile: Path of OVS ovs-vswitchd pid. +@param install_prefix: Path where is openvswitch installed. + +# if path is None set default path. +if not install_prefix: +install_prefix = / +if not db_path: +db_path = os.path.join(install_prefix, + /etc/openvswitch/conf.db) +if not db_socket: +db_socket = os.path.join(install_prefix, + /var/run/openvswitch/db.sock) +if not db_pidfile: +db_pidfile = os.path.join(install_prefix, + /var/run/openvswitch/ovsdb-server.pid) +if not ovs_pidfile: +ovs_pidfile = os.path.join(install_prefix, + /var/run/openvswitch/ovs-vswitchd.pid) +if not dbschema: +dbschema = os.path.join(install_prefix, +
[Autotest][PATCH 3/5] virt: Add functionality for VMs.
Allow creating of machine with tap devices which are not connected to bridge. Add function for fill virtnet object with address. Signed-off-by: Jiří Župka jzu...@redhat.com --- client/tests/virt/virttest/kvm_vm.py |9 + client/tests/virt/virttest/utils_misc.py |3 ++- client/tests/virt/virttest/virt_vm.py| 20 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/client/tests/virt/virttest/kvm_vm.py b/client/tests/virt/virttest/kvm_vm.py index 247f0eb..d3b4ede 100644 --- a/client/tests/virt/virttest/kvm_vm.py +++ b/client/tests/virt/virttest/kvm_vm.py @@ -949,7 +949,7 @@ class VM(virt_vm.BaseVM): qemu_cmd += add_name(hlp, name) # no automagic devices please defaults = params.get(defaults, no) -if has_option(hlp,nodefaults) and defaults != yes: +if has_option(hlp, nodefaults) and defaults != yes: qemu_cmd += -nodefaults # Add monitors for monitor_name in params.objects(monitors): @@ -1065,7 +1065,7 @@ class VM(virt_vm.BaseVM): for nic in vm.virtnet: # setup nic parameters as needed -nic = vm.add_nic(**dict(nic)) # add_netdev if netdev_id not set +nic = vm.add_nic(**dict(nic)) # add_netdev if netdev_id not set # gather set values or None if unset vlan = int(nic.get('vlan')) netdev_id = nic.get('netdev_id') @@ -2064,7 +2064,7 @@ class VM(virt_vm.BaseVM): nic.set_if_none('nettype', 'bridge') if nic.nettype == 'bridge': # implies tap # destination is required, hard-code reasonable default if unset -nic.set_if_none('netdst', 'virbr0') +# nic.set_if_none('netdst', 'virbr0') # tapfd allocated/set in activate because requires system resources nic.set_if_none('tapfd_id', utils_misc.generate_random_id()) elif nic.nettype == 'user': @@ -2142,7 +2142,8 @@ class VM(virt_vm.BaseVM): error.context(Raising bridge for + msg_sfx + attach_cmd, logging.debug) # assume this will puke if netdst unset -utils_misc.add_to_bridge(nic.ifname, nic.netdst) +if not nic.netdst is None: +utils_misc.add_to_bridge(nic.ifname, nic.netdst) elif nic.nettype == 'user': attach_cmd += user,name=%s % 
nic.ifname else: # unsupported nettype diff --git a/client/tests/virt/virttest/utils_misc.py b/client/tests/virt/virttest/utils_misc.py index e416551..b4799e7 100644 --- a/client/tests/virt/virttest/utils_misc.py +++ b/client/tests/virt/virttest/utils_misc.py @@ -719,7 +719,8 @@ class VirtIface(PropCan): Networking information for single guest interface and host connection. -__slots__ = ['nic_name', 'mac', 'nic_model', 'ip', 'nettype', 'netdst'] +__slots__ = ['nic_name', 'g_nic_name', 'mac', 'nic_model', 'ip', + 'nettype', 'netdst'] # Make sure first byte generated is always zero and it follows # the class definition. This helps provide more predictable # addressing while avoiding clashes between multiple NICs. diff --git a/client/tests/virt/virttest/virt_vm.py b/client/tests/virt/virttest/virt_vm.py index 79dd08b..ff6a0f7 100644 --- a/client/tests/virt/virttest/virt_vm.py +++ b/client/tests/virt/virttest/virt_vm.py @@ -518,6 +518,7 @@ class BaseVM(object): # Make sure the IP address is assigned to one or more macs # for this guest macs = self.virtnet.mac_list() + if not utils_misc.verify_ip_address_ownership(arp_ip, macs): raise VMAddressVerificationError(nic.mac, arp_ip) logging.debug('Found/Verified IP %s for VM %s NIC %s' % ( @@ -525,6 +526,25 @@ class BaseVM(object): return arp_ip +def fill_addrs(self, addrs): + +Fill VM's nic address to the virtnet structure based on VM's address +structure addrs. + +@param addrs: Dict of interfaces and address +{if_name:{mac:['addrs',], +ipv4:['addrs',], +ipv6:['addrs',]}, + ...} + +for virtnet in self.virtnet: +for iface_name, iface in addrs.iteritems(): +if virtnet.mac in iface[mac]: +virtnet.ip = {ipv4: iface[ipv4], + ipv6: iface[ipv6]} +virtnet.g_nic_name = iface_name + + def get_port(self, port, nic_index=0): Return the port in host space corresponding to port in guest space. 
-- 1.7.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[Autotest][PATCH 5/5] virt: Fix import paths.
Signed-off-by: Jiří Župka jzu...@redhat.com --- client/tests/virt/kvm/control.parallel|2 +- client/tests/virt/virttest/libvirt_xml.py |2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/client/tests/virt/kvm/control.parallel b/client/tests/virt/kvm/control.parallel index a33365b..70806c0 100644 --- a/client/tests/virt/kvm/control.parallel +++ b/client/tests/virt/kvm/control.parallel @@ -174,7 +174,7 @@ tests = list(parser.get_dicts()) # - # Run the tests # - -from autotest.client.virt import scheduler +from autotest.client.tests.virt.virttest import scheduler from autotest.client import utils # total_cpus defaults to the number of CPUs reported by /proc/cpuinfo diff --git a/client/tests/virt/virttest/libvirt_xml.py b/client/tests/virt/virttest/libvirt_xml.py index 79fe87a..992bfc2 100644 --- a/client/tests/virt/virttest/libvirt_xml.py +++ b/client/tests/virt/virttest/libvirt_xml.py @@ -11,7 +11,7 @@ import logging, os.path from autotest.client.shared import error, xml_utils -from autotest.client.virt import libvirt_vm, virsh +from virttest import libvirt_vm, virsh class LibvirtXMLError(Exception): -- 1.7.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/6] Reduce compaction scanning and lock contention
On Fri, Sep 21, 2012 at 10:13:33AM +0100, Richard Davies wrote: Hi Mel, Thank you for this series. I have applied on clean 3.6-rc5 and tested, and it works well for me - the lock contention is (still) gone and isolate_freepages_block is much reduced. Excellent! Here is a typical test with these patches: # grep -F '[k]' report | head -8 65.20% qemu-kvm [kernel.kallsyms] [k] clear_page_c 2.18% qemu-kvm [kernel.kallsyms] [k] isolate_freepages_block 1.56% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock 1.40% qemu-kvm [kernel.kallsyms] [k] svm_vcpu_run 1.38% swapper [kernel.kallsyms] [k] default_idle 1.35% qemu-kvm [kernel.kallsyms] [k] get_page_from_freelist 0.74% ksmd [kernel.kallsyms] [k] memcmp 0.72% qemu-kvm [kernel.kallsyms] [k] free_pages_prepare Ok, so that is more or less acceptable. I would like to reduce the scanning even further but I'll take this as a start -- largely because I do not have any new good ideas on how it could be reduced further without incurring a large cost in the page allocator :) I did manage to get a couple which were slightly worse, but nothing like as bad as before. 
Here are the results: # grep -F '[k]' report | head -8 45.60% qemu-kvm [kernel.kallsyms] [k] clear_page_c 11.26% qemu-kvm [kernel.kallsyms] [k] isolate_freepages_block 3.21% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock 2.27% ksmd [kernel.kallsyms] [k] memcmp 2.02%swapper [kernel.kallsyms] [k] default_idle 1.58% qemu-kvm [kernel.kallsyms] [k] svm_vcpu_run 1.30% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock_irqsave 1.09% qemu-kvm [kernel.kallsyms] [k] get_page_from_freelist # grep -F '[k]' report | head -8 61.29% qemu-kvm [kernel.kallsyms] [k] clear_page_c 4.52% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock_irqsave 2.64% qemu-kvm [kernel.kallsyms] [k] copy_page_c 1.61%swapper [kernel.kallsyms] [k] default_idle 1.57% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock 1.18% qemu-kvm [kernel.kallsyms] [k] get_page_from_freelist 1.18% qemu-kvm [kernel.kallsyms] [k] isolate_freepages_block 1.11% qemu-kvm [kernel.kallsyms] [k] svm_vcpu_run Were the boot times acceptable even when these slightly worse figures were recorded? I will follow up with the detailed traces for these three tests. Thank you! Thank you for the detailed reporting and the testing, it's much appreciated. I've already rebased the patches to Andrew's tree and tested them overnight and the figures look good on my side. I'll update the changelog and push them shortly. -- Mel Gorman SUSE Labs -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/6] Reduce compaction scanning and lock contention
Mel Gorman wrote: I did manage to get a couple which were slightly worse, but nothing like as bad as before. Here are the results: # grep -F '[k]' report | head -8 45.60% qemu-kvm [kernel.kallsyms] [k] clear_page_c 11.26% qemu-kvm [kernel.kallsyms] [k] isolate_freepages_block 3.21% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock 2.27% ksmd [kernel.kallsyms] [k] memcmp 2.02%swapper [kernel.kallsyms] [k] default_idle 1.58% qemu-kvm [kernel.kallsyms] [k] svm_vcpu_run 1.30% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock_irqsave 1.09% qemu-kvm [kernel.kallsyms] [k] get_page_from_freelist # grep -F '[k]' report | head -8 61.29% qemu-kvm [kernel.kallsyms] [k] clear_page_c 4.52% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock_irqsave 2.64% qemu-kvm [kernel.kallsyms] [k] copy_page_c 1.61%swapper [kernel.kallsyms] [k] default_idle 1.57% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock 1.18% qemu-kvm [kernel.kallsyms] [k] get_page_from_freelist 1.18% qemu-kvm [kernel.kallsyms] [k] isolate_freepages_block 1.11% qemu-kvm [kernel.kallsyms] [k] svm_vcpu_run Were the boot times acceptable even when these slightly worse figures were recorded? Yes, they were 10-20% slower as you might expect from the traces, rather than a factor slower. Thank you for the detailed reporting and the testing, it's much appreciated. I've already rebased the patches to Andrew's tree and tested them overnight and the figures look good on my side. I'll update the changelog and push them shortly. Great. On my side, I'm delighted that senior kernel developers such as you, Rik and Avi took our bug report seriously and helped fix it! Thank you, Richard. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 1/2] KVM: PPC: Book3S: Get/set guest SPRs using the GET/SET_ONE_REG interface
On Fri, Sep 21, 2012 at 11:15:51AM +0200, Alexander Graf wrote: On 21.09.2012, at 07:44, Paul Mackerras wrote: +union kvmppc_one_reg { + u32 wval; + u64 dval; Phew. Is this guaranteed to always pad on the right, rather than left? Absolutely (for big-endian targets). A union is basically a struct with all the members at offset 0. So we read N bytes into the union and then access the N-byte member. +static int one_reg_size(u64 id) +{ + id = (id KVM_REG_SIZE_MASK) KVM_REG_SIZE_SHIFT; + return 1 id; +} Rusty has a patch to export this as a generic function for all kvm targets. Check out http://permalink.gmane.org/gmane.comp.emulators.kvm.devel/97553 I actually like static inlines better, but we really should keep this code on top level :). Once Rusty's patch goes in I'd be happy to use it. + switch (reg-id) { + case KVM_REG_PPC_DAR: + val.dval = vcpu-arch.shared-dar; Hmm. I would like to keep the amount of places who need to manually know about the size of a register as low as possible. We do know the size of the register already on in the REG id and in sizeof(register), right? This code doesn't manually know about the size of the register. It could be that vcpu-arch.shared-dar is 4 bytes, or 2 bytes, instead of 8 bytes, and that wouldn't affect this code. All this code knows is that the definition of KVM_REG_PPC_DAR includes the 8 byte encoding. And since that definition is -- or will be -- part of the userspace ABI, it's not something that can be changed later, so I don't see any problem with just using val.dval here. I think there will be valid scenarios for them to be of different size. For example when a register suddenly increases in size. But the assignment should always happen on the size the register id tells us. It doesn't matter if a register changes in size, that doesn't affect this code. We may want to define an extra KVM_REG_PPC_* symbol if it increases in size, but that would be a different symbol to the ones we already have. 
As I said, we couldn't just arbitrarily change the existing symbol because that would break existing userspace programs. In other words the assignment already does happen on the size the register id tells us, with my code. So how about something like #define kvmppc_set_reg(id, val, reg) { \ switch (one_reg_size(id)) { \ case 4: val.wval = reg; break; \ case 8: val.dval = reg; break; \ default: BUG(); \ } \ } case KVM_REG_PPC_DAR: kvmppc_set_reg(reg-id, val, vcpu-arch.shared-dar); break; Maybe you can come up with something even easier though :). Seems unnecessarily complex to me. If we did something like that we should do kvmppc_set_reg(KVM_REG_PPC_DAR, val, vcpu-arch.shared-dar); and use a macro instead of one_reg_size() to give the compiler a better chance to do constant folding and only compile in the branch of the switch statement that we need. Btw, with such a scheme in place, we wouldn't even need the union. We could just reserve a char array and cast it in the setter/getter functions. I'm always wary of accessing a variable of one type as a different type using a cast on its address, because of the C99 type-punning rules. It's probably OK with a char array, but I think a union is cleaner, because you're explicitly stating that you want some storage that can be accessed as different types. Paul. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/6] Reduce compaction scanning and lock contention
On Fri, Sep 21, 2012 at 10:17:01AM +0100, Richard Davies wrote: Richard Davies wrote: I did manage to get a couple which were slightly worse, but nothing like as bad as before. Here are the results: # grep -F '[k]' report | head -8 45.60% qemu-kvm [kernel.kallsyms] [k] clear_page_c 11.26% qemu-kvm [kernel.kallsyms] [k] isolate_freepages_block 3.21% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock 2.27% ksmd [kernel.kallsyms] [k] memcmp 2.02%swapper [kernel.kallsyms] [k] default_idle 1.58% qemu-kvm [kernel.kallsyms] [k] svm_vcpu_run 1.30% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock_irqsave 1.09% qemu-kvm [kernel.kallsyms] [k] get_page_from_freelist # # captured on: Fri Sep 21 08:17:52 2012 # os release : 3.6.0-rc5-elastic+ # perf version : 3.5.2 # arch : x86_64 # nrcpus online : 16 # nrcpus avail : 16 # cpudesc : AMD Opteron(tm) Processor 6128 # cpuid : AuthenticAMD,16,9,1 # total memory : 131973276 kB # cmdline : /home/root/bin/perf record -g -a # event : name = cycles, type = 0, config = 0x0, config1 = 0x0, config2 = 0x0, excl_usr = 0, excl_kern = 0, id = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 } # HEADER_CPU_TOPOLOGY info available, use -I to display # HEADER_NUMA_TOPOLOGY info available, use -I to display # # # Samples: 283K of event 'cycles' # Event count (approx.): 109057976176 # # OverheadCommand Shared Object Symbol # . .. # 45.60% qemu-kvm [kernel.kallsyms] [k] clear_page_c | --- clear_page_c | |--93.35%-- do_huge_pmd_anonymous_page This is unavoidable. If THP was disabled, the cost would still be incurred, just on base pages instead of huge pages. SNIP 11.26% qemu-kvm [kernel.kallsyms] [k] isolate_freepages_block | --- isolate_freepages_block compaction_alloc migrate_pages compact_zone compact_zone_order try_to_compact_pages __alloc_pages_direct_compact __alloc_pages_nodemask alloc_pages_vma do_huge_pmd_anonymous_page And this is showing that we're still spending a lot of time scanning for free pages to isolate. 
I do not have a great idea on how this can be reduced further without interfering with the page allocator. One ok idea I considered in the past was using the buddy lists to find free pages quickly but there is first the problem that the buddy lists themselves may need to be searched and now that the zone lock is not held during the scan it would be particularly difficult. The harder problem is deciding when compaction finishes. I'll put more thought into it over the weekend and see if something falls out but I'm not going to hold up this series waiting for inspiration. 3.21% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock | --- _raw_spin_lock | |--39.96%-- tdp_page_fault Nothing very interesting here until... |--1.69%-- free_pcppages_bulk | | | |--77.53%-- drain_pages | | | | | |--95.77%-- drain_local_pages | | | | | | | |--97.90%-- generic_smp_call_function_interrupt | | | | smp_call_function_interrupt | | | | call_function_interrupt | | | | | | | | | |--23.37%-- kvm_vcpu_ioctl | | | | | do_vfs_ioctl | | | | | sys_ioctl | | | | | system_call_fastpath | | | | | ioctl | | | | | | | | | | | |--97.22%-- 0x1010006 | | |
Re: [PATCH v3 1/2] KVM: PPC: Book3S: Get/set guest SPRs using the GET/SET_ONE_REG interface
On 21.09.2012, at 11:52, Paul Mackerras wrote: On Fri, Sep 21, 2012 at 11:15:51AM +0200, Alexander Graf wrote: On 21.09.2012, at 07:44, Paul Mackerras wrote: +union kvmppc_one_reg { + u32 wval; + u64 dval; Phew. Is this guaranteed to always pad on the right, rather than left? Absolutely (for big-endian targets). A union is basically a struct with all the members at offset 0. So we read N bytes into the union and then access the N-byte member. +static int one_reg_size(u64 id) +{ + id = (id KVM_REG_SIZE_MASK) KVM_REG_SIZE_SHIFT; + return 1 id; +} Rusty has a patch to export this as a generic function for all kvm targets. Check out http://permalink.gmane.org/gmane.comp.emulators.kvm.devel/97553 I actually like static inlines better, but we really should keep this code on top level :). Once Rusty's patch goes in I'd be happy to use it. + switch (reg-id) { + case KVM_REG_PPC_DAR: + val.dval = vcpu-arch.shared-dar; Hmm. I would like to keep the amount of places who need to manually know about the size of a register as low as possible. We do know the size of the register already on in the REG id and in sizeof(register), right? This code doesn't manually know about the size of the register. It could be that vcpu-arch.shared-dar is 4 bytes, or 2 bytes, instead of 8 bytes, and that wouldn't affect this code. All this code knows is that the definition of KVM_REG_PPC_DAR includes the 8 byte encoding. And since that definition is -- or will be -- part of the userspace ABI, it's not something that can be changed later, so I don't see any problem with just using val.dval here. It's not really a problem, it's really about redundancy. We write the same thing twice case id that encodes length: val.length dependent value = foo; And every time we do constructs like that, someone will one day come in and do fun like case id with length 8: val.length 4 = foo; So if we can get rid of that complete bug category from day 1, it makes me happy :). 
I think there will be valid scenarios for them to be of different size. For example when a register suddenly increases in size. But the assignment should always happen on the size the register id tells us. It doesn't matter if a register changes in size, that doesn't affect this code. We may want to define an extra KVM_REG_PPC_* symbol if it increases in size, but that would be a different symbol to the ones we already have. As I said, we couldn't just arbitrarily change the existing symbol because that would break existing userspace programs. In other words the assignment already does happen on the size the register id tells us, with my code. Imagine we get new awesome 128bit GPRs now. The lower 64bit are still the same as they always were. The new machine is also backwards compatible, so we could be running 64bit code. To transfer the bigger registers we could then do case KVM_REG_PPC_GPR1: case KVM_REG_PPC_GPR1_128: kvmppc_set_reg(id, val, foo); But the bit I care a lot more about is the redundancy as explained above. So how about something like #define kvmppc_set_reg(id, val, reg) { \ switch (one_reg_size(id)) { \ case 4: val.wval = reg; break; \ case 8: val.dval = reg; break; \ default: BUG(); \ } \ } case KVM_REG_PPC_DAR: kvmppc_set_reg(reg-id, val, vcpu-arch.shared-dar); break; Maybe you can come up with something even easier though :). Seems unnecessarily complex to me. If we did something like that we should do kvmppc_set_reg(KVM_REG_PPC_DAR, val, vcpu-arch.shared-dar); and use a macro instead of one_reg_size() to give the compiler a better chance to do constant folding and only compile in the branch of the switch statement that we need. What variable type would the 3rd argument be? Also, constant folding should still work with one_reg_size(), no? Btw, with such a scheme in place, we wouldn't even need the union. We could just reserve a char array and cast it in the setter/getter functions. 
I'm always wary of accessing a variable of one type as a different type using a cast on its address, because of the C99 type-punning rules. It's probably OK with a char array, but I think a union is cleaner, because you're explicitly stating that you want some storage that can be accessed as different types. If unions are guaranteed to be laid out the way you describe above, that's perfectly fine for me. I just remember nightmares when I had to convert x86 incomplete unions (the 2 members were of different size) to ppc, where they started at the reverse end. A scheme that isn't tied to BE would maybe allow us to share code with ARM and x86. Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/9] Reduce compaction scanning and lock contention
Hi Andrew, Richard Davies and Shaohua Li have both reported lock contention problems in compaction on the zone and LRU locks as well as significant amounts of time being spent in compaction. This series aims to reduce lock contention and scanning rates to reduce that CPU usage. Richard reported at https://lkml.org/lkml/2012/9/21/91 that this series made a big different to a problem he reported in August (http://marc.info/?l=kvmm=134511507015614w=2). Patches 1-3 reverts existing patches in Andrew's tree that get replaced later in the series. Patch 4 is a fix for c67fe375 (mm: compaction: Abort async compaction if locks are contended or taking too long) to properly abort in all cases when contention is detected. Patch 5 defers acquiring the zone-lru_lock as long as possible. Patch 6 defers acquiring the zone-lock as lock as possible. Patch 7 reverts Rik's skip-free patches as the core concept gets reimplemented later and the remaining patches are easier to understand if this is reverted first. Patch 8 adds a pageblock-skip bit to the pageblock flags to cache what pageblocks should be skipped by the migrate and free scanners. This drastically reduces the amount of scanning compaction has to do. Patch 9 reimplements something similar to Rik's idea except it uses the pageblock-skip information to decide where the scanners should restart from and does not need to wrap around. I tested this on 3.6-rc6 + linux-next/akpm. Kernels tested were akpm-20120920 3.6-rc6 + linux-next/akpm as of Septeber 20th, 2012 lesslockPatches 1-6 revert Patches 1-7 cachefail Patches 1-8 skipuseless Patches 1-9 Stress high-order allocation tests looked ok. Success rates are more or less the same with the full series applied but there is an expectation that there is less opportunity to race with other allocation requests if there is less scanning. The time to complete the tests did not vary that much and are uninteresting as were the vmstat statistics so I will not present them here. 
Using ftrace I recorded how much scanning was done by compaction and got this 3.6.0-rc6 3.6.0-rc6 3.6.0-rc6 3.6.0-rc6 3.6.0-rc6 akpm-20120920 lockless revert-v2r2 cachefail skipuseless Total freescanned 360753976 515414028 565479007 17103281 18916589 Total freeisolated 285242935973694048601 670493 727840 Total freeefficiency0.0079%0.0070%0.0072%0.0392% 0.0385% Total migrate scanned 247728664 822729112 1004645830 17946827 14118903 Total migrate isolated 255532432459373437501 616359 658616 Total migrate efficiency0.0103%0.0039%0.0034%0.0343% 0.0466% The efficiency is worthless because of the nature of the test and the number of failures. The really interesting point as far as this patch series is concerned is the number of pages scanned. Note that reverting Rik's patches massively increases the number of pages scanned indicating that those patches really did make a difference to CPU usage. However, caching what pageblocks should be skipped has a much higher impact. With patches 1-8 applied, free page and migrate page scanning are both reduced by 95% in comparison to the akpm kernel. If the basic concept of Rik's patches are implemened on top then scanning then the free scanner barely changed but migrate scanning was further reduced. That said, tests on 3.6-rc5 indicated that the last patch had greater impact than what was measured here so it is a bit variable. One way or the other, this series has a large impact on the amount of scanning compaction does when there is a storm of THP allocations. include/linux/mmzone.h |5 +- include/linux/pageblock-flags.h | 19 +- mm/compaction.c | 397 +-- mm/internal.h | 11 +- mm/page_alloc.c |6 +- 5 files changed, 280 insertions(+), 158 deletions(-) -- 1.7.9.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/9] Revert mm: compaction: check lock contention first before taking lock
This reverts mm-compaction-check-lock-contention-first-before-taking-lock.patch as it is replaced by a later patch in the series. Signed-off-by: Mel Gorman mgor...@suse.de --- mm/compaction.c |5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 3bb7232..4a77b4b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -347,9 +347,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, /* Time to isolate some pages for migration */ cond_resched(); - locked = compact_trylock_irqsave(zone-lru_lock, flags, cc); - if (!locked) - return 0; + spin_lock_irqsave(zone-lru_lock, flags); + locked = true; for (; low_pfn end_pfn; low_pfn++) { struct page *page; -- 1.7.9.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/9] Revert mm: compaction: abort compaction loop if lock is contended or run too long
This reverts mm-compaction-abort-compaction-loop-if-lock-is-contended-or-run-too-long.patch as it is replaced by a later patch in the series. Signed-off-by: Mel Gorman mgor...@suse.de --- mm/compaction.c | 12 +--- mm/internal.h |2 +- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 1c873bb..614f18b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -70,7 +70,8 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, /* async aborts if taking too long or contended */ if (!cc-sync) { - cc-contended = true; + if (cc-contended) + *cc-contended = true; return false; } @@ -685,7 +686,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, /* Perform the isolation */ low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn); - if (!low_pfn || cc-contended) + if (!low_pfn) return ISOLATE_ABORT; cc-migrate_pfn = low_pfn; @@ -893,7 +894,6 @@ static unsigned long compact_zone_order(struct zone *zone, bool sync, bool *contended, struct page **page) { - unsigned long ret; struct compact_control cc = { .nr_freepages = 0, .nr_migratepages = 0, @@ -901,15 +901,13 @@ static unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, + .contended = contended, .page = page, }; INIT_LIST_HEAD(cc.freepages); INIT_LIST_HEAD(cc.migratepages); - ret = compact_zone(zone, cc); - if (contended) - *contended = cc.contended; - return ret; + return compact_zone(zone, cc); } int sysctl_extfrag_threshold = 500; diff --git a/mm/internal.h b/mm/internal.h index eebbed5..386772f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -131,7 +131,7 @@ struct compact_control { int order; /* order a direct compactor needs */ int migratetype;/* MOVABLE, RECLAIMABLE etc */ struct zone *zone; - bool contended; /* True if a lock was contended */ + bool *contended;/* True if a lock was contended */ struct page **page; /* Page captured of 
requested size */ }; -- 1.7.9.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/9] mm: compaction: Acquire the zone->lru_lock as late as possible
Compactions migrate scanner acquires the zone-lru_lock when scanning a range of pages looking for LRU pages to acquire. It does this even if there are no LRU pages in the range. If multiple processes are compacting then this can cause severe locking contention. To make matters worse commit b2eef8c0 (mm: compaction: minimise the time IRQs are disabled while isolating pages for migration) releases the lru_lock every SWAP_CLUSTER_MAX pages that are scanned. This patch makes two changes to how the migrate scanner acquires the LRU lock. First, it only releases the LRU lock every SWAP_CLUSTER_MAX pages if the lock is contended. This reduces the number of times it unnecessarily disables and re-enables IRQs. The second is that it defers acquiring the LRU lock for as long as possible. If there are no LRU pages or the only LRU pages are transhuge then the LRU lock will not be acquired at all which reduces contention on zone-lru_lock. Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rik van Riel r...@redhat.com --- mm/compaction.c | 63 +-- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 6b55491..a6068ff 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -50,6 +50,11 @@ static inline bool migrate_async_suitable(int migratetype) return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; } +static inline bool should_release_lock(spinlock_t *lock) +{ + return need_resched() || spin_is_contended(lock); +} + /* * Compaction requires the taking of some coarse locks that are potentially * very heavily contended. 
Check if the process needs to be scheduled or @@ -62,7 +67,7 @@ static inline bool migrate_async_suitable(int migratetype) static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, bool locked, struct compact_control *cc) { - if (need_resched() || spin_is_contended(lock)) { + if (should_release_lock(lock)) { if (locked) { spin_unlock_irqrestore(lock, *flags); locked = false; @@ -327,7 +332,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, isolate_mode_t mode = 0; struct lruvec *lruvec; unsigned long flags; - bool locked; + bool locked = false; /* * Ensure that there are not too many pages isolated from the LRU @@ -347,23 +352,17 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, /* Time to isolate some pages for migration */ cond_resched(); - spin_lock_irqsave(zone-lru_lock, flags); - locked = true; for (; low_pfn end_pfn; low_pfn++) { struct page *page; /* give a chance to irqs before checking need_resched() */ - if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { - spin_unlock_irqrestore(zone-lru_lock, flags); - locked = false; + if (locked !((low_pfn+1) % SWAP_CLUSTER_MAX)) { + if (should_release_lock(zone-lru_lock)) { + spin_unlock_irqrestore(zone-lru_lock, flags); + locked = false; + } } - /* Check if it is ok to still hold the lock */ - locked = compact_checklock_irqsave(zone-lru_lock, flags, - locked, cc); - if (!locked || fatal_signal_pending(current)) - break; - /* * migrate_pfn does not necessarily start aligned to a * pageblock. 
Ensure that pfn_valid is called when moving @@ -403,21 +402,38 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, pageblock_nr = low_pfn pageblock_order; if (!cc-sync last_pageblock_nr != pageblock_nr !migrate_async_suitable(get_pageblock_migratetype(page))) { - low_pfn += pageblock_nr_pages; - low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; - last_pageblock_nr = pageblock_nr; - continue; + goto next_pageblock; } + /* Check may be lockless but that's ok as we recheck later */ if (!PageLRU(page)) continue; /* -* PageLRU is set, and lru_lock excludes isolation, -* splitting and collapsing (collapsing has already -* happened if PageLRU is set). +* PageLRU is set. lru_lock normally excludes isolation +* splitting and collapsing (collapsing has already happened +* if PageLRU is set) but the lock
[PATCH 4/9] mm: compaction: Abort compaction loop if lock is contended or run too long
From: Shaohua Li s...@fusionio.com Changelog since V2 o Fix BUG_ON triggered due to pages left on cc.migratepages o Make compact_zone_order() require non-NULL arg `contended' Changelog since V1 o only abort the compaction if lock is contended or run too long o Rearranged the code by Andrea Arcangeli. isolate_migratepages_range() might isolate no pages if for example when zone-lru_lock is contended and running asynchronous compaction. In this case, we should abort compaction, otherwise, compact_zone will run a useless loop and make zone-lru_lock is even contended. [minc...@kernel.org: Putback pages isolated for migration if aborting] [a...@linux-foundation.org: compact_zone_order requires non-NULL arg contended] Signed-off-by: Andrea Arcangeli aarca...@redhat.com Signed-off-by: Shaohua Li s...@fusionio.com Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rik van Riel r...@redhat.com --- mm/compaction.c | 17 - mm/internal.h |2 +- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 614f18b..6b55491 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -70,8 +70,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, /* async aborts if taking too long or contended */ if (!cc-sync) { - if (cc-contended) - *cc-contended = true; + cc-contended = true; return false; } @@ -686,7 +685,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, /* Perform the isolation */ low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn); - if (!low_pfn) + if (!low_pfn || cc-contended) return ISOLATE_ABORT; cc-migrate_pfn = low_pfn; @@ -846,6 +845,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) switch (isolate_migratepages(zone, cc)) { case ISOLATE_ABORT: ret = COMPACT_PARTIAL; + putback_lru_pages(cc-migratepages); + cc-nr_migratepages = 0; goto out; case ISOLATE_NONE: continue; @@ -894,6 +895,7 @@ static unsigned long compact_zone_order(struct zone *zone, bool 
sync, bool *contended, struct page **page) { + unsigned long ret; struct compact_control cc = { .nr_freepages = 0, .nr_migratepages = 0, @@ -901,13 +903,18 @@ static unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, - .contended = contended, .page = page, }; INIT_LIST_HEAD(cc.freepages); INIT_LIST_HEAD(cc.migratepages); - return compact_zone(zone, cc); + ret = compact_zone(zone, cc); + + VM_BUG_ON(!list_empty(cc.freepages)); + VM_BUG_ON(!list_empty(cc.migratepages)); + + *contended = cc.contended; + return ret; } int sysctl_extfrag_threshold = 500; diff --git a/mm/internal.h b/mm/internal.h index 386772f..eebbed5 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -131,7 +131,7 @@ struct compact_control { int order; /* order a direct compactor needs */ int migratetype;/* MOVABLE, RECLAIMABLE etc */ struct zone *zone; - bool *contended;/* True if a lock was contended */ + bool contended; /* True if a lock was contended */ struct page **page; /* Page captured of requested size */ }; -- 1.7.9.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 7/9] Revert mm: have order > 0 compaction start off where it left
This reverts commit 7db8889a (mm: have order 0 compaction start off where it left) and commit de74f1cc (mm: have order 0 compaction start near a pageblock with free pages). These patches were a good idea and tests confirmed that they massively reduced the amount of scanning but the implementation is complex and tricky to understand. A later patch will cache what pageblocks should be skipped and reimplements the concept of compact_cached_free_pfn on top for both migration and free scanners. Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rik van Riel r...@redhat.com --- include/linux/mmzone.h |4 --- mm/compaction.c| 65 mm/internal.h |6 - mm/page_alloc.c|5 4 files changed, 5 insertions(+), 75 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2daa54f..603d0b5 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -368,10 +368,6 @@ struct zone { */ spinlock_t lock; int all_unreclaimable; /* All pages pinned */ -#if defined CONFIG_COMPACTION || defined CONFIG_CMA - /* pfn where the last incremental compaction isolated free pages */ - unsigned long compact_cached_free_pfn; -#endif #ifdef CONFIG_MEMORY_HOTPLUG /* see spanned/present_pages for more description */ seqlock_t span_seqlock; diff --git a/mm/compaction.c b/mm/compaction.c index 8e56594..9fc1b61 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -539,20 +539,6 @@ next_pageblock: #endif /* CONFIG_COMPACTION || CONFIG_CMA */ #ifdef CONFIG_COMPACTION /* - * Returns the start pfn of the last page block in a zone. This is the starting - * point for full compaction of a zone. Compaction searches for free pages from - * the end of each zone, while isolate_freepages_block scans forward inside each - * page block. 
- */ -static unsigned long start_free_pfn(struct zone *zone) -{ - unsigned long free_pfn; - free_pfn = zone-zone_start_pfn + zone-spanned_pages; - free_pfn = ~(pageblock_nr_pages-1); - return free_pfn; -} - -/* * Based on information in the current compact_control, find blocks * suitable for isolating free pages from and then isolate them. */ @@ -620,19 +606,8 @@ static void isolate_freepages(struct zone *zone, * looking for free pages, the search will restart here as * page migration may have returned some pages to the allocator */ - if (isolated) { + if (isolated) high_pfn = max(high_pfn, pfn); - - /* -* If the free scanner has wrapped, update -* compact_cached_free_pfn to point to the highest -* pageblock with free pages. This reduces excessive -* scanning of full pageblocks near the end of the -* zone -*/ - if (cc-order 0 cc-wrapped) - zone-compact_cached_free_pfn = high_pfn; - } } /* split_free_page does not map the pages */ @@ -640,11 +615,6 @@ static void isolate_freepages(struct zone *zone, cc-free_pfn = high_pfn; cc-nr_freepages = nr_freepages; - - /* If compact_cached_free_pfn is reset then set it now */ - if (cc-order 0 !cc-wrapped - zone-compact_cached_free_pfn == start_free_pfn(zone)) - zone-compact_cached_free_pfn = high_pfn; } /* @@ -739,26 +709,8 @@ static int compact_finished(struct zone *zone, if (fatal_signal_pending(current)) return COMPACT_PARTIAL; - /* -* A full (order == -1) compaction run starts at the beginning and -* end of a zone; it completes when the migrate and free scanner meet. -* A partial (order 0) compaction can start with the free scanner -* at a random point in the zone, and may have to restart. -*/ - if (cc-free_pfn = cc-migrate_pfn) { - if (cc-order 0 !cc-wrapped) { - /* We started partway through; restart at the end. 
*/ - unsigned long free_pfn = start_free_pfn(zone); - zone-compact_cached_free_pfn = free_pfn; - cc-free_pfn = free_pfn; - cc-wrapped = 1; - return COMPACT_CONTINUE; - } - return COMPACT_COMPLETE; - } - - /* We wrapped around and ended up where we started. */ - if (cc-wrapped cc-free_pfn = cc-start_free_pfn) + /* Compaction run completes if the migrate and free scanner meet */ + if (cc-free_pfn = cc-migrate_pfn) return COMPACT_COMPLETE; /* @@ -864,15 +816,8 @@ static int
[PATCH 6/9] mm: compaction: Acquire the zone->lock as late as possible
Compactions free scanner acquires the zone-lock when checking for PageBuddy pages and isolating them. It does this even if there are no PageBuddy pages in the range. This patch defers acquiring the zone lock for as long as possible. In the event there are no free pages in the pageblock then the lock will not be acquired at all which reduces contention on zone-lock. Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rik van Riel r...@redhat.com --- mm/compaction.c | 141 ++- 1 file changed, 78 insertions(+), 63 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index a6068ff..8e56594 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -93,6 +93,28 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock, return compact_checklock_irqsave(lock, flags, false, cc); } +/* Returns true if the page is within a block suitable for migration to */ +static bool suitable_migration_target(struct page *page) +{ + + int migratetype = get_pageblock_migratetype(page); + + /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ + if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) + return false; + + /* If the page is a large free page, then allow migration */ + if (PageBuddy(page) page_order(page) = pageblock_order) + return true; + + /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ + if (migrate_async_suitable(migratetype)) + return true; + + /* Otherwise skip the block */ + return false; +} + static void compact_capture_page(struct compact_control *cc) { unsigned long flags; @@ -153,13 +175,16 @@ static void compact_capture_page(struct compact_control *cc) * pages inside of the pageblock (even though it may still end up isolating * some pages). 
*/ -static unsigned long isolate_freepages_block(unsigned long blockpfn, +static unsigned long isolate_freepages_block(struct compact_control *cc, + unsigned long blockpfn, unsigned long end_pfn, struct list_head *freelist, bool strict) { int nr_scanned = 0, total_isolated = 0; struct page *cursor; + unsigned long flags; + bool locked = false; cursor = pfn_to_page(blockpfn); @@ -168,23 +193,38 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn, int isolated, i; struct page *page = cursor; - if (!pfn_valid_within(blockpfn)) { - if (strict) - return 0; - continue; - } + if (!pfn_valid_within(blockpfn)) + goto strict_check; nr_scanned++; - if (!PageBuddy(page)) { - if (strict) - return 0; - continue; - } + if (!PageBuddy(page)) + goto strict_check; + + /* +* The zone lock must be held to isolate freepages. This +* unfortunately this is a very coarse lock and can be +* heavily contended if there are parallel allocations +* or parallel compactions. For async compaction do not +* spin on the lock and we acquire the lock as late as +* possible. +*/ + locked = compact_checklock_irqsave(cc-zone-lock, flags, + locked, cc); + if (!locked) + break; + + /* Recheck this is a suitable migration target under lock */ + if (!strict !suitable_migration_target(page)) + break; + + /* Recheck this is a buddy page under lock */ + if (!PageBuddy(page)) + goto strict_check; /* Found a free page, break it into order-0 pages */ isolated = split_free_page(page); if (!isolated strict) - return 0; + goto strict_check; total_isolated += isolated; for (i = 0; i isolated; i++) { list_add(page-lru, freelist); @@ -196,9 +236,23 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn, blockpfn += isolated - 1; cursor += isolated - 1; } + + continue; + +strict_check: + /* Abort isolation if the caller requested strict isolation */ + if (strict) { + total_isolated = 0; + goto out; + } }
[PATCH 9/9] mm: compaction: Restart compaction from near where it left off
This is almost entirely based on Rik's previous patches and discussions with him about how this might be implemented. Order 0 compaction stops when enough free pages of the correct page order have been coalesced. When doing subsequent higher order allocations, it is possible for compaction to be invoked many times. However, the compaction code always starts out looking for things to compact at the start of the zone, and for free pages to compact things to at the end of the zone. This can cause quadratic behaviour, with isolate_freepages starting at the end of the zone each time, even though previous invocations of the compaction code already filled up all free memory on that end of the zone. This can cause isolate_freepages to take enormous amounts of CPU with certain workloads on larger memory systems. This patch caches where the migration and free scanner should start from on subsequent compaction invocations using the pageblock-skip information. When compaction starts it begins from the cached restart points and will update the cached restart points until a page is isolated or a pageblock is skipped that would have been scanned by synchronous compaction. 
Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rik van Riel r...@redhat.com --- include/linux/mmzone.h |4 mm/compaction.c| 54 mm/internal.h |4 3 files changed, 53 insertions(+), 9 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a456361..e7792a3 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -370,6 +370,10 @@ struct zone { int all_unreclaimable; /* All pages pinned */ #if defined CONFIG_COMPACTION || defined CONFIG_CMA unsigned long compact_blockskip_expire; + + /* pfns where compaction scanners should start */ + unsigned long compact_cached_free_pfn; + unsigned long compact_cached_migrate_pfn; #endif #ifdef CONFIG_MEMORY_HOTPLUG /* see spanned/present_pages for more description */ diff --git a/mm/compaction.c b/mm/compaction.c index 9276bc8..4bd96f3 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -79,6 +79,9 @@ static void reset_isolation_suitable(struct zone *zone) */ if (time_before(jiffies, zone-compact_blockskip_expire)) return; + + zone-compact_cached_migrate_pfn = start_pfn; + zone-compact_cached_free_pfn = end_pfn; zone-compact_blockskip_expire = jiffies + (HZ * 5); /* Walk the zone and mark every pageblock as suitable for isolation */ @@ -99,13 +102,29 @@ static void reset_isolation_suitable(struct zone *zone) * If no pages were isolated then mark this pageblock to be skipped in the * future. The information is later cleared by reset_isolation_suitable(). 
*/ -static void update_pageblock_skip(struct page *page, unsigned long nr_isolated) +static void update_pageblock_skip(struct compact_control *cc, + struct page *page, unsigned long nr_isolated, + bool migrate_scanner) { + struct zone *zone = cc-zone; if (!page) return; - if (!nr_isolated) + if (!nr_isolated) { + unsigned long pfn = page_to_pfn(page); set_pageblock_skip(page); + + /* Update where compaction should restart */ + if (migrate_scanner) { + if (!cc-finished_update_migrate + pfn zone-compact_cached_migrate_pfn) + zone-compact_cached_migrate_pfn = pfn; + } else { + if (!cc-finished_update_free + pfn zone-compact_cached_free_pfn) + zone-compact_cached_free_pfn = pfn; + } + } } static inline bool should_release_lock(spinlock_t *lock) @@ -315,7 +334,7 @@ out: /* Update the pageblock-skip if the whole pageblock was scanned */ if (blockpfn == end_pfn) - update_pageblock_skip(valid_page, total_isolated); + update_pageblock_skip(cc, valid_page, total_isolated, false); return total_isolated; } @@ -530,6 +549,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, */ if (!cc-sync last_pageblock_nr != pageblock_nr !migrate_async_suitable(get_pageblock_migratetype(page))) { + cc-finished_update_migrate = true; goto next_pageblock; } @@ -578,6 +598,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, VM_BUG_ON(PageTransCompound(page)); /* Successfully isolated */ + cc-finished_update_migrate = true; del_page_from_lru_list(page, lruvec, page_lru(page));
[PATCH 8/9] mm: compaction: Cache if a pageblock was scanned and no pages were isolated
When compaction was implemented it was known that scanning could potentially be excessive. The ideal was that a counter be maintained for each pageblock but maintaining this information would incur a severe penalty due to a shared writable cache line. It has reached the point where the scanning costs are an serious problem, particularly on long-lived systems where a large process starts and allocates a large number of THPs at the same time. Instead of using a shared counter, this patch adds another bit to the pageblock flags called PG_migrate_skip. If a pageblock is scanned by either migrate or free scanner and 0 pages were isolated, the pageblock is marked to be skipped in the future. When scanning, this bit is checked before any scanning takes place and the block skipped if set. The main difficulty with a patch like this is when to ignore the cached information? If it's ignored too often, the scanning rates will still be excessive. If the information is too stale then allocations will fail that might have otherwise succeeded. In this patch o CMA always ignores the information o If the migrate and free scanner meet then the cached information will be discarded if it's at least 5 seconds since the last time the cache was discarded o If there are a large number of allocation failures, discard the cache. The time-based heuristic is very clumsy but there are few choices for a better event. Depending solely on multiple allocation failures still allows excessive scanning when THP allocations are failing in quick succession due to memory pressure. Waiting until memory pressure is relieved would cause compaction to continually fail instead of using reclaim/compaction to try allocate the page. The time-based mechanism is clumsy but a better option is not obvious. 
Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rik van Riel r...@redhat.com --- include/linux/mmzone.h |3 ++ include/linux/pageblock-flags.h | 19 +++- mm/compaction.c | 93 +-- mm/internal.h |1 + mm/page_alloc.c |1 + 5 files changed, 111 insertions(+), 6 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 603d0b5..a456361 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -368,6 +368,9 @@ struct zone { */ spinlock_t lock; int all_unreclaimable; /* All pages pinned */ +#if defined CONFIG_COMPACTION || defined CONFIG_CMA + unsigned long compact_blockskip_expire; +#endif #ifdef CONFIG_MEMORY_HOTPLUG /* see spanned/present_pages for more description */ seqlock_t span_seqlock; diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 19ef95d..eed27f4 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -30,6 +30,9 @@ enum pageblock_bits { PB_migrate, PB_migrate_end = PB_migrate + 3 - 1, /* 3 bits required for migrate types */ +#ifdef CONFIG_COMPACTION + PB_migrate_skip,/* If set the block is skipped by compaction */ +#endif /* CONFIG_COMPACTION */ NR_PAGEBLOCK_BITS }; @@ -65,10 +68,22 @@ unsigned long get_pageblock_flags_group(struct page *page, void set_pageblock_flags_group(struct page *page, unsigned long flags, int start_bitidx, int end_bitidx); +#ifdef CONFIG_COMPACTION +#define get_pageblock_skip(page) \ + get_pageblock_flags_group(page, PB_migrate_skip, \ + PB_migrate_skip + 1) +#define clear_pageblock_skip(page) \ + set_pageblock_flags_group(page, 0, PB_migrate_skip, \ + PB_migrate_skip + 1) +#define set_pageblock_skip(page) \ + set_pageblock_flags_group(page, 1, PB_migrate_skip, \ + PB_migrate_skip + 1) +#endif /* CONFIG_COMPACTION */ + #define get_pageblock_flags(page) \ - get_pageblock_flags_group(page, 0, NR_PAGEBLOCK_BITS-1) + get_pageblock_flags_group(page, 0, PB_migrate_end) #define set_pageblock_flags(page, flags) \ set_pageblock_flags_group(page, 
flags, \ - 0, NR_PAGEBLOCK_BITS-1) + 0, PB_migrate_end) #endif /* PAGEBLOCK_FLAGS_H */ diff --git a/mm/compaction.c b/mm/compaction.c index 9fc1b61..9276bc8 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -50,6 +50,64 @@ static inline bool migrate_async_suitable(int migratetype) return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; } +/* Returns true if the pageblock should be scanned for pages to isolate. */ +static inline bool
[PATCH 2/9] Revert mm-compaction-abort-compaction-loop-if-lock-is-contended-or-run-too-long-fix
This reverts mm-compaction-abort-compaction-loop-if-lock-is-contended-or-run-too-long-fix as it is replaced by a later patch in the series. Signed-off-by: Mel Gorman mgor...@suse.de --- mm/compaction.c |3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index 4a77b4b..1c873bb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -907,7 +907,8 @@ static unsigned long compact_zone_order(struct zone *zone, INIT_LIST_HEAD(cc.migratepages); ret = compact_zone(zone, cc); - *contended = cc.contended; + if (contended) + *contended = cc.contended; return ret; } -- 1.7.9.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3 00/19] ACPI memory hotplug
This is v3 of the ACPI memory hotplug functionality. Only x86_64 target is supported for now. Overview: Dimm device layout is modeled with a new qemu command line -dimm id=name,size=sz,node=pxm,populated=on|off The starting physical address for all dimms is calculated automatically from top of memory, skipping the pci hole at [PCI_HOLE_START, 4G). Node is defining numa proximity for this dimm. When not defined it defaults to zero. -dimm id=dimm0,size=512M,node=0,populated=off will define a 512M memory slot belonging to numa node 0. Dimms are added or removed with normal device_add, device_del operations: Hot-add syntax: device_add dimm,id=mydimm0 Hot-remove syntax: dimm_del dimm,id=mydimm0 Changes v2-v3 - qdev integration. Dimms are attached to a dimmbus. The dimmbus is a child of i440fx device in the pc machine. Hot-add and hot-remove are done with normal device_add / device_del operations on the dimmbus. New commands dimm_add and dimm_del are obsolete. (In previous versions, dimms were always present on the qdev tree, and dimm_add/del simply meant allocating or deallocating memory for the devices. This version actually does hot-operations on the qdev tree) - Add _PS3 method to allow OSPM-induced hot operations. - pci-window calculation in Seabios takes dimms into account(for both 32-bit and 64-bit windows) - rename new qmp commands: query-memory-total and query-memory-hotplug - balloon driver can see the hotplugged memory Changes v1-v2 - memory map is automatically calculated for hotplug dimms. Dimms are added from top-of-memory skipping the pci hole at [PCI_HOLE_START, 4G). - Renamed from -memslot to -dimm. Commands changed to dimm_add, dimm_del. - Seabios ejection array reduced to a byte. Use extraction macros for dimm ssdt. - additional SRAT paravirt info does not break previous SRAT fw_cfg layout. - Documentation of new acpi_piix4 registers and paravirt data. - add ACPI _OST support for _OST enabled guests. 
This allows qemu to receive notification for success / failure of memory hot-add and hot-remove operations. Guest needs to support _OST (https://lkml.org/lkml/2012/6/25/321) - add monitor info command to report total guest memory (initial + hot-added) - add command line options and monitor commands for batch dimm creation/population (obsolete from v3 onwards) Issues: - A main blocker issue is windows guest functionality. The patchset does not work for windows currently. My guess is the windows pnpmem driver does not like the seabios dimm device implementation (or the seabios dimm implementation is not fully ACPI-compliant). If someone can review the seabios patches or has any ideas to debug this, let me know. Testing on win2012 server RC or windows2008 consumer prerelease. When adding a DIMM, the device shows up in DeviceManager but does not work. Relevant messages: This device cannot start. (Code 10) Device configured(memory.inf) (UserPnP eventID 400) Device installed (memory.inf) ACPI/PNP0C80\2daba3ff1 was configured Device not started(PNPMEM) (Kernel-PnP eventID 411, kernelID) Device ACPI\PNP0C80\2daba3ff1 had a problem starting Driver Name: memory.inf (c:\Windows\system32\DRIVERS\pnpmem.sys 6.2.8400 winmain_win8rc)) Memory range:0x8000 - 0x9000 (Initial memory of VM is 2GB. The hotplugged DIMM was a 256GB with physical address range starting at 2GB ) Conflicting device list: No conflicts. Adding a 2nd or more dimms causes a crash (PNP_DETECTED_FATAL_ERROR with blue screen of death) and makes windows reboot. After this, the VM keeps rebooting with ACPI_BIOS_ERROR. The VM refuses to boot anymore once a 2nd (or more) extra dimm is plugged-in. - Is the dimmbus the correct way to go about integrating into qdev/qom? In a v1 comment, Anthony mentioned attaching dimms directly to an i440fx device as children. Is this possible without a bus? 
- Live migration works as long as the dimm layout (-dimm command line args) are identical at the source and destination qemu command line. Patch 10/19 creates the DimmDevice that corresponds to the unknown incoming ramblock. Ramblocks are migrated before qdev VMStates are migrated (the DimmDevice structure currently does not define a VMStateDescription). So the DimmDevice is handled diferrently than other devices. If this is not acceptable, any suggestions on how should it be reworked? - Hot-operation notification lists need to be added to migration state. Please review. Could people state which other issues they consider blocker for including this upstream? Does this patchset need to wait for 1.4 or could this be considered for 1.3 (assuming blockers are resolved)? The patchset has been revised every few months, but I will provide quicker version updates onwards. I can also bring this up on a weekly meeting agenda if needed. series is based on uq/master for qemu-kvm, and master for seabios. Can be found also at: http://github.com/vliaskov/qemu-kvm/commits/memhp-v3
[RFC PATCH v3 01/19][SeaBIOS] Add ACPI_EXTRACT_DEVICE* macros
This allows extracting the beginning, end, and name of a Device object.
[RFC PATCH v3 03/19][SeaBIOS] acpi-dsdt: Implement functions for memory hotplug
Extend the DSDT to include methods for handling memory hot-add and hot-remove notifications and memory device status requests. These functions are called from the memory device SSDT methods. Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- src/acpi-dsdt.dsl | 70 +++- 1 files changed, 68 insertions(+), 2 deletions(-) diff --git a/src/acpi-dsdt.dsl b/src/acpi-dsdt.dsl index 2060686..5d3e92b 100644 --- a/src/acpi-dsdt.dsl +++ b/src/acpi-dsdt.dsl @@ -737,6 +737,71 @@ DefinitionBlock ( } Return(One) } +/* Objects filled in by run-time generated SSDT */ +External(MTFY, MethodObj) +External(MEON, PkgObj) + +Method (CMST, 1, NotSerialized) { +// _STA method - return ON status of memdevice +// Local0 = MEON flag for this cpu +Store(DerefOf(Index(MEON, Arg0)), Local0) +If (Local0) { Return(0xF) } Else { Return(0x0) } +} + +/* Memory hotplug notify array */ +OperationRegion(MEST, SystemIO, 0xaf80, 32) +Field (MEST, ByteAcc, NoLock, Preserve) +{ +MES, 256 +} + +/* Memory eject byte */ +OperationRegion(MEMJ, SystemIO, 0xafa0, 1) +Field (MEMJ, ByteAcc, NoLock, Preserve) +{ +MPE, 8 +} + +Method(MESC, 0) { +// Local5 = active memdevice bitmap +Store (MES, Local5) +// Local2 = last read byte from bitmap +Store (Zero, Local2) +// Local0 = memory device iterator +Store (Zero, Local0) +While (LLess(Local0, SizeOf(MEON))) { +// Local1 = MEON flag for this memory device +Store(DerefOf(Index(MEON, Local0)), Local1) +If (And(Local0, 0x07)) { +// Shift down previously read bitmap byte +ShiftRight(Local2, 1, Local2) +} Else { +// Read next byte from memdevice bitmap +Store(DerefOf(Index(Local5, ShiftRight(Local0, 3))), Local2) +} +// Local3 = active state for this memory device +Store(And(Local2, 1), Local3) + +If (LNotEqual(Local1, Local3)) { +// State change - update MEON with new state +Store(Local3, Index(MEON, Local0)) +// Do MEM notify +If (LEqual(Local3, 1)) { +MTFY(Local0, 1) +} Else { +MTFY(Local0, 3) +} +} +Increment(Local0) +} +Return(One) +} + +Method 
(MPEJ, 2, NotSerialized) { +// _EJ0 method - eject callback +Store(Arg0, MPE) +Sleep(200) +} } @@ -759,8 +824,9 @@ DefinitionBlock ( // CPU hotplug event Return(\_SB.PRSC()) } -Method(_L03) { -Return(0x01) +Method(_E03) { +// Memory hotplug event +Return(\_SB.MESC()) } Method(_L04) { Return(0x01) -- 1.7.9 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3 04/19][SeaBIOS] acpi: generate hotplug memory devices
The memory device generation is guided by qemu paravirt info. Seabios first uses the info to setup SRAT entries for the hotplug-able memory slots. Afterwards, build_memssdt uses the created SRAT entries to generate appropriate memory device objects. One memory device (and corresponding SRAT entry) is generated for each hotplug-able qemu memslot. Currently no SSDT memory device is created for initial system memory. We only support up to 255 DIMMs for now (PackageOp used for the MEON array can only describe an array of at most 255 elements. VarPackageOp would be needed to support more than 255 devices) v1-v2: Seabios reads mems_sts from qemu to build e820_map SSDT size and some offsets are calculated with extraction macros. v2-v3: Minor name changes Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- src/acpi.c | 158 +-- 1 files changed, 152 insertions(+), 6 deletions(-) diff --git a/src/acpi.c b/src/acpi.c index 6d239fa..1223b52 100644 --- a/src/acpi.c +++ b/src/acpi.c @@ -13,6 +13,7 @@ #include pci_regs.h // PCI_INTERRUPT_LINE #include ioport.h // inl #include paravirt.h // qemu_cfg_irq0_override +#include memmap.h // /* ACPI tables init */ @@ -416,11 +417,26 @@ encodeLen(u8 *ssdt_ptr, int length, int bytes) #define PCIHP_AML (ssdp_pcihp_aml + *ssdt_pcihp_start) #define PCI_SLOTS 32 +/* 0x5B 0x82 DeviceOp PkgLength NameString DimmID */ +#define MEM_BASE 0xaf80 +#define MEM_AML (ssdm_mem_aml + *ssdt_mem_start) +#define MEM_SIZEOF (*ssdt_mem_end - *ssdt_mem_start) +#define MEM_OFFSET_HEX (*ssdt_mem_name - *ssdt_mem_start + 2) +#define MEM_OFFSET_ID (*ssdt_mem_id - *ssdt_mem_start) +#define MEM_OFFSET_PXM 31 +#define MEM_OFFSET_START 55 +#define MEM_OFFSET_END 63 +#define MEM_OFFSET_SIZE 79 + +u64 nb_hp_memslots = 0; +struct srat_memory_affinity *mem; + #define SSDT_SIGNATURE 0x54445353 // SSDT #define SSDT_HEADER_LENGTH 36 #include ssdt-susp.hex #include ssdt-pcihp.hex +#include ssdt-mem.hex #define PCI_RMV_BASE 0xae0c @@ -472,6 +488,111 @@ 
static void patch_pcihp(int slot, u8 *ssdt_ptr, u32 eject) } } +static void build_memdev(u8 *ssdt_ptr, int i, u64 mem_base, u64 mem_len, u8 node) +{ +memcpy(ssdt_ptr, MEM_AML, MEM_SIZEOF); +ssdt_ptr[MEM_OFFSET_HEX] = getHex(i 4); +ssdt_ptr[MEM_OFFSET_HEX+1] = getHex(i); +ssdt_ptr[MEM_OFFSET_ID] = i; +ssdt_ptr[MEM_OFFSET_PXM] = node; +*(u64*)(ssdt_ptr + MEM_OFFSET_START) = mem_base; +*(u64*)(ssdt_ptr + MEM_OFFSET_END) = mem_base + mem_len; +*(u64*)(ssdt_ptr + MEM_OFFSET_SIZE) = mem_len; +} + +static void* +build_memssdt(void) +{ +u64 mem_base; +u64 mem_len; +u8 node; +int i; +struct srat_memory_affinity *entry = mem; +u64 nb_memdevs = nb_hp_memslots; +u8 memslot_status, enabled; + +int length = ((1+3+4) + + (nb_memdevs * MEM_SIZEOF) + + (1+2+5+(12*nb_memdevs)) + + (6+2+1+(1*nb_memdevs))); +u8 *ssdt = malloc_high(sizeof(struct acpi_table_header) + length); +if (! ssdt) { +warn_noalloc(); +return NULL; +} +u8 *ssdt_ptr = ssdt + sizeof(struct acpi_table_header); + +// build Scope(_SB_) header +*(ssdt_ptr++) = 0x10; // ScopeOp +ssdt_ptr = encodeLen(ssdt_ptr, length-1, 3); +*(ssdt_ptr++) = '_'; +*(ssdt_ptr++) = 'S'; +*(ssdt_ptr++) = 'B'; +*(ssdt_ptr++) = '_'; + +for (i = 0; i nb_memdevs; i++) { +mem_base = (((u64)(entry-base_addr_high) 32 )| entry-base_addr_low); +mem_len = (((u64)(entry-length_high) 32 )| entry-length_low); +node = entry-proximity[0]; +build_memdev(ssdt_ptr, i, mem_base, mem_len, node); +ssdt_ptr += MEM_SIZEOF; +entry++; +} + +// build Method(MTFY, 2) {If (LEqual(Arg0, 0x00)) {Notify(CM00, Arg1)} ...} +*(ssdt_ptr++) = 0x14; // MethodOp +ssdt_ptr = encodeLen(ssdt_ptr, 2+5+(12*nb_memdevs), 2); +*(ssdt_ptr++) = 'M'; +*(ssdt_ptr++) = 'T'; +*(ssdt_ptr++) = 'F'; +*(ssdt_ptr++) = 'Y'; +*(ssdt_ptr++) = 0x02; +for (i=0; inb_memdevs; i++) { +*(ssdt_ptr++) = 0xA0; // IfOp + ssdt_ptr = encodeLen(ssdt_ptr, 11, 1); +*(ssdt_ptr++) = 0x93; // LEqualOp +*(ssdt_ptr++) = 0x68; // Arg0Op +*(ssdt_ptr++) = 0x0A; // BytePrefix +*(ssdt_ptr++) = i; +*(ssdt_ptr++) = 0x86; // 
NotifyOp +*(ssdt_ptr++) = 'M'; +*(ssdt_ptr++) = 'P'; +*(ssdt_ptr++) = getHex(i 4); +*(ssdt_ptr++) = getHex(i); +*(ssdt_ptr++) = 0x69; // Arg1Op +} + +// build Name(MEON, Package() { One, One, ..., Zero, Zero, ... }) +*(ssdt_ptr++) = 0x08; // NameOp +*(ssdt_ptr++) = 'M'; +*(ssdt_ptr++) = 'E'; +*(ssdt_ptr++) = 'O'; +*(ssdt_ptr++) = 'N'; +*(ssdt_ptr++) = 0x12; // PackageOp +ssdt_ptr = encodeLen(ssdt_ptr, 2+1+(1*nb_memdevs), 2); +*(ssdt_ptr++) =
[RFC PATCH v3 10/19] fix live-migration when populated=on is missing
Live migration works after memory hot-add events, as long as the qemu command line -dimm arguments are changed on the destination host to specify populated=on for the dimms that have been hot-added. If a command-line change has not occured, the destination host does not yet have the corresponding ramblock in its ram_list. Activate the dimm on the destination during ram_load. Perhaps several fields of the DimmDevice should be part of a VMStateDescription to handle migration in a cleaner way. But the problem is that ramblocks are checked before qdev vmstates. Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- arch_init.c | 24 +--- 1 files changed, 21 insertions(+), 3 deletions(-) diff --git a/arch_init.c b/arch_init.c index 5a1173e..b63caa7 100644 --- a/arch_init.c +++ b/arch_init.c @@ -45,6 +45,7 @@ #include hw/pcspk.h #include qemu/page_cache.h #include qmp-commands.h +#include hw/dimm.h #ifdef DEBUG_ARCH_INIT #define DPRINTF(fmt, ...) \ @@ -740,10 +741,27 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) } if (!block) { -fprintf(stderr, Unknown ramblock \%s\, cannot +/* this can happen if a dimm was hot-added at source host */ +bool ramblock_found = false; +if (dimm_add(id)) { +fprintf(stderr, Cannot add unknown ramblock \%s\, +cannot accept migration\n, id); +ret = -EINVAL; +goto done; +} +/* rescan ram_list, verify ramblock is there now */ +QLIST_FOREACH(block, ram_list.blocks, next) { +if (!strncmp(id, block-idstr, sizeof(id))) { +ramblock_found = true; +break; +} +} +if (!ramblock_found) { +fprintf(stderr, Unknown ramblock \%s\, cannot accept migration\n, id); -ret = -EINVAL; -goto done; +ret = -EINVAL; +goto done; +} } total_ram_bytes -= length; -- 1.7.9 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3 09/19] pc: Add dimm paravirt SRAT info
The numa_fw_cfg paravirt interface is extended to include SRAT information for all hotplug-able dimms. There are 3 words for each hotplug-able memory slot, denoting start address, size and node proximity. The new info is appended after existing numa info, so that the fw_cfg layout does not break. This information is used by Seabios to build hotplug memory device objects at runtime. nb_numa_nodes is set to 1 by default (not 0), so that we always pass srat info to SeaBIOS. v1-v2: Dimm SRAT info (#dimms) is appended at end of existing numa fw_cfg in order not to break existing layout Documentation of the new fwcfg layout is included in docs/specs/fwcfg.txt Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- docs/specs/fwcfg.txt | 28 hw/pc.c | 14 -- 2 files changed, 40 insertions(+), 2 deletions(-) create mode 100644 docs/specs/fwcfg.txt diff --git a/docs/specs/fwcfg.txt b/docs/specs/fwcfg.txt new file mode 100644 index 000..55f96d9 --- /dev/null +++ b/docs/specs/fwcfg.txt @@ -0,0 +1,28 @@ +QEMU-BIOS Paravirt Documentation +-- + +This document describes paravirt data structures passed from QEMU to BIOS. + +FW_CFG_NUMA paravirt info + +The SRAT info passed from QEMU to BIOS has the following layout: + +--- +#nodes | cpu0_pxm | cpu1_pxm | ... | cpulast_pxm | node0_mem | node1_mem | ... | nodelast_mem + +--- +#dimms | dimm0_start | dimm0_sz | dimm0_pxm | ... | dimmlast_start | dimmlast_sz | dimmlast_pxm + +Entry 0 contains the number of numa nodes (nb_numa_nodes). + +Entries 1..max_cpus: The next max_cpus entries describe node proximity for each +one of the vCPUs in the system. + +Entries max_cpus+1..max_cpus+nb_numa_nodes+1: The next nb_numa_nodes entries +describe the memory size for each one of the NUMA nodes in the system. 
+ +Entry max_cpus+nb_numa_nodes+1 contains the number of memory dimms (nb_hp_dimms) + +The last 3 * nb_hp_dimms entries are organized in triplets: Each triplet contains +the physical address offset, size (in bytes), and node proximity for the +respective dimm. diff --git a/hw/pc.c b/hw/pc.c index 2c9664d..f2604ae 100644 --- a/hw/pc.c +++ b/hw/pc.c @@ -598,6 +598,7 @@ static void *bochs_bios_init(void) uint8_t *smbios_table; size_t smbios_len; uint64_t *numa_fw_cfg; +uint64_t *hp_dimms_fw_cfg; int i, j; register_ioport_write(0x400, 1, 2, bochs_bios_write, NULL); @@ -632,8 +633,10 @@ static void *bochs_bios_init(void) /* allocate memory for the NUMA channel: one (64bit) word for the number * of nodes, one word for each VCPU-node and one word for each node to * hold the amount of memory. + * Finally one word for the number of hotplug memory slots and three words + * for each hotplug memory slot (start address, size and node proximity). */ -numa_fw_cfg = g_malloc0((1 + max_cpus + nb_numa_nodes) * 8); +numa_fw_cfg = g_malloc0((2 + max_cpus + nb_numa_nodes + 3 * nb_hp_dimms) * 8); numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes); for (i = 0; i max_cpus; i++) { for (j = 0; j nb_numa_nodes; j++) { @@ -646,8 +649,15 @@ static void *bochs_bios_init(void) for (i = 0; i nb_numa_nodes; i++) { numa_fw_cfg[max_cpus + 1 + i] = cpu_to_le64(node_mem[i]); } + +numa_fw_cfg[1 + max_cpus + nb_numa_nodes] = cpu_to_le64(nb_hp_dimms); + +hp_dimms_fw_cfg = numa_fw_cfg + 2 + max_cpus + nb_numa_nodes; +if (nb_hp_dimms) +setup_fwcfg_hp_dimms(hp_dimms_fw_cfg); + fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, (uint8_t *)numa_fw_cfg, - (1 + max_cpus + nb_numa_nodes) * 8); + (2 + max_cpus + nb_numa_nodes + 3 * nb_hp_dimms) * 8); return fw_cfg; } -- 1.7.9 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3 11/19] Implement qmp and hmp commands for notification lists
Guest can respond to ACPI hotplug events e.g. with _EJ or _OST method. This patch implements a tail queue to store guest notifications for memory hot-add and hot-remove requests. Guest responses for memory hotplug command on a per-dimm basis can be detected with the new hmp command info memhp or the new qmp command query-memhp Examples: (qemu) device_add dimm,id=ram0 (qemu) info memory-hotplug dimm: ram0 hot-add success or dimm: ram0 hot-add failure (qemu) device_del ram3 (qemu) info memory-hotplug dimm: ram3 hot-remove success or dimm: ram3 hot-remove failure Results are removed from the queue once read. This patch only queues _EJ events that signal hot-remove success. For _OST event queuing, which cover the hot-remove failure and hot-add success/failure cases, the _OST patches in this series are are also needed. These notification items should probably be part of migration state (not yet implemented). Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- hmp-commands.hx |2 + hmp.c| 17 ++ hmp.h|1 + hw/dimm.c| 62 +- hw/dimm.h|2 +- monitor.c|7 ++ qapi-schema.json | 26 ++ qmp-commands.hx | 37 8 files changed, 152 insertions(+), 2 deletions(-) diff --git a/hmp-commands.hx b/hmp-commands.hx index ed67e99..cfb1b67 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -1462,6 +1462,8 @@ show device tree show qdev device model list @item info roms show roms +@item info memory-hotplug +show memory-hotplug @end table ETEXI diff --git a/hmp.c b/hmp.c index ba6fbd3..4b3d63d 100644 --- a/hmp.c +++ b/hmp.c @@ -1168,3 +1168,20 @@ void hmp_screen_dump(Monitor *mon, const QDict *qdict) qmp_screendump(filename, err); hmp_handle_error(mon, err); } + +void hmp_info_memory_hotplug(Monitor *mon) +{ +MemHpInfoList *info; +MemHpInfoList *item; +MemHpInfo *dimm; + +info = qmp_query_memory_hotplug(NULL); +for (item = info; item; item = item-next) { +dimm = item-value; +monitor_printf(mon, dimm: %s %s %s\n, dimm-dimm, +dimm-request, dimm-result); +dimm-dimm = NULL; +} 
+ +qapi_free_MemHpInfoList(info); +} diff --git a/hmp.h b/hmp.h index 48b9c59..986705a 100644 --- a/hmp.h +++ b/hmp.h @@ -73,5 +73,6 @@ void hmp_getfd(Monitor *mon, const QDict *qdict); void hmp_closefd(Monitor *mon, const QDict *qdict); void hmp_send_key(Monitor *mon, const QDict *qdict); void hmp_screen_dump(Monitor *mon, const QDict *qdict); +void hmp_info_memory_hotplug(Monitor *mon); #endif diff --git a/hw/dimm.c b/hw/dimm.c index 288b997..fbd93a8 100644 --- a/hw/dimm.c +++ b/hw/dimm.c @@ -65,6 +65,7 @@ static void dimm_bus_initfn(Object *obj) DimmBus *bus = DIMM_BUS(obj); QTAILQ_INIT(bus-dimmconfig_list); QTAILQ_INIT(bus-dimmlist); +QTAILQ_INIT(bus-dimm_hp_result_queue); QTAILQ_FOREACH_SAFE(dimm_cfg, dimmconfig_list, nextdimmcfg, next_dimm_cfg) { QTAILQ_REMOVE(dimmconfig_list, dimm_cfg, nextdimmcfg); @@ -236,20 +237,78 @@ void dimm_notify(uint32_t idx, uint32_t event) { DimmBus *bus = main_memory_bus; DimmDevice *s; +DimmConfig *slotcfg; +struct dimm_hp_result *result; + s = dimm_find_from_idx(idx); assert(s != NULL); +result = g_malloc0(sizeof(*result)); +slotcfg = dimmcfg_find_from_name(DEVICE(s)-id); +result-dimmname = slotcfg-name; switch(event) { case DIMM_REMOVE_SUCCESS: dimm_depopulate(s); -qdev_simple_unplug_cb((DeviceState*)s); QTAILQ_REMOVE(bus-dimmlist, s, nextdimm); +qdev_simple_unplug_cb((DeviceState*)s); +QTAILQ_INSERT_TAIL(bus-dimm_hp_result_queue, result, next); break; default: +g_free(result); break; } } +MemHpInfoList *qmp_query_memory_hotplug(Error **errp) +{ +DimmBus *bus = main_memory_bus; +MemHpInfoList *head = NULL, *cur_item = NULL, *info; +struct dimm_hp_result *item, *nextitem; + +QTAILQ_FOREACH_SAFE(item, bus-dimm_hp_result_queue, next, nextitem) { + +info = g_malloc0(sizeof(*info)); +info-value = g_malloc0(sizeof(*info-value)); +info-value-dimm = g_malloc0(sizeof(char) * 32); +info-value-request = g_malloc0(sizeof(char) * 16); +info-value-result = g_malloc0(sizeof(char) * 16); +switch (item-ret) { +case DIMM_REMOVE_SUCCESS: 
+strcpy(info-value-request, hot-remove); +strcpy(info-value-result, success); +break; +case DIMM_REMOVE_FAIL: +strcpy(info-value-request, hot-remove); +strcpy(info-value-result, failure); +break; +case DIMM_ADD_SUCCESS: +strcpy(info-value-request, hot-add); +
[RFC PATCH v3 17/19][SeaBIOS] Implement _PS3 method for memory device
Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- src/acpi-dsdt.dsl | 15 +++ src/ssdt-mem.dsl |4 2 files changed, 19 insertions(+), 0 deletions(-) diff --git a/src/acpi-dsdt.dsl b/src/acpi-dsdt.dsl index 0d37bbc..8a18770 100644 --- a/src/acpi-dsdt.dsl +++ b/src/acpi-dsdt.dsl @@ -784,6 +784,13 @@ DefinitionBlock ( MIF, 8 } +/* Memory _PS3 byte */ +OperationRegion(MPSB, SystemIO, 0xafa4, 1) +Field (MPSB, ByteAcc, NoLock, Preserve) +{ +MPS, 8 +} + Method(MESC, 0) { // Local5 = active memdevice bitmap Store (MES, Local5) @@ -824,6 +831,14 @@ DefinitionBlock ( Store(Arg0, MPE) Sleep(200) } + +Method (MPS3, 1, NotSerialized) { +// _PS3 method - power-off method +Store(Arg0, MPS) +Store(Zero, Index(MEON, Arg0)) +Sleep(200) +} + Method (MOST, 3, Serialized) { // _OST method - OS status indication Switch (And(Arg0, 0xFF)) { diff --git a/src/ssdt-mem.dsl b/src/ssdt-mem.dsl index 041d301..7423fc6 100644 --- a/src/ssdt-mem.dsl +++ b/src/ssdt-mem.dsl @@ -39,6 +39,7 @@ DefinitionBlock (ssdt-mem.aml, SSDT, 0x02, BXPC, CSSDT, 0x1) External(CMST, MethodObj) External(MPEJ, MethodObj) External(MOST, MethodObj) +External(MPS3, MethodObj) Name(_CRS, ResourceTemplate() { QwordMemory( @@ -64,6 +65,9 @@ DefinitionBlock (ssdt-mem.aml, SSDT, 0x02, BXPC, CSSDT, 0x1) Method (_OST, 3) { MOST(Arg0, Arg1, ID) } +Method (_PS3, 0) { +MPS3(ID) +} } } -- 1.7.9 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3 18/19] Implement _PS3 for dimm
This will allow us to update dimm state on OSPM-initiated eject operations e.g. with "echo 1 > /sys/bus/acpi/devices/PNP0C80\:00/eject"
a/hw/dimm.c +++ b/hw/dimm.c @@ -319,6 +319,9 @@ void dimm_notify(uint32_t idx, uint32_t event) qdev_simple_unplug_cb((DeviceState*)s); QTAILQ_INSERT_TAIL(bus-dimm_hp_result_queue, result, next); break; +case DIMM_OSPM_POWEROFF: +if (bus-dimm_revert) +bus-dimm_revert(bus-dimm_hotplug_qdev, s, 1); default: g_free(result); break; diff --git a/hw/dimm.h b/hw/dimm.h index ce091fe..8d73b8f 100644 --- a/hw/dimm.h +++ b/hw/dimm.h @@ -15,7 +15,8 @@ typedef enum { DIMM_REMOVE_SUCCESS = 0, DIMM_REMOVE_FAIL = 1, DIMM_ADD_SUCCESS = 2, -DIMM_ADD_FAIL = 3 +DIMM_ADD_FAIL = 3, +DIMM_OSPM_POWEROFF = 4 } dimm_hp_result_code; typedef enum { -- 1.7.9 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3 08/19] pc: calculate dimm physical addresses and adjust memory map
Dimm physical address offsets are calculated automatically and memory map is adjusted accordingly. If a DIMM can fit before the PCI_HOLE_START (currently 0xe000), it will be added normally, otherwise its physical address will be above 4GB. Also create memory bus on i440fx-pcihost device. Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- hw/pc.c | 41 + hw/pc.h |6 ++ hw/pc_piix.c | 20 ++-- vl.c |1 + 4 files changed, 62 insertions(+), 6 deletions(-) diff --git a/hw/pc.c b/hw/pc.c index 112739a..2c9664d 100644 --- a/hw/pc.c +++ b/hw/pc.c @@ -52,6 +52,7 @@ #include arch_init.h #include bitmap.h #include vga-pci.h +#include dimm.h /* output Bochs bios info messages */ //#define DEBUG_BIOS @@ -93,6 +94,9 @@ struct e820_table { static struct e820_table e820_table; struct hpet_fw_config hpet_cfg = {.count = UINT8_MAX}; +ram_addr_t below_4g_hp_mem_size = 0; +ram_addr_t above_4g_hp_mem_size = 0; +extern target_phys_addr_t ram_hp_offset; void gsi_handler(void *opaque, int n, int level) { GSIState *s = opaque; @@ -1160,3 +1164,40 @@ void pc_pci_device_init(PCIBus *pci_bus) pci_create_simple(pci_bus, -1, lsi53c895a); } } + + +/* Function to configure memory offsets of hotpluggable dimms */ + +target_phys_addr_t pc_set_hp_memory_offset(uint64_t size) +{ +target_phys_addr_t ret; + +/* on first call, initialize ram_hp_offset */ +if (!ram_hp_offset) { +if (ram_size = PCI_HOLE_START ) { +ram_hp_offset = 0x1LL + (ram_size - PCI_HOLE_START); +} else { +ram_hp_offset = ram_size; +} +} + +if (ram_hp_offset = 0x1LL) { +ret = ram_hp_offset; +above_4g_hp_mem_size += size; +ram_hp_offset += size; +} +/* if dimm fits before pci hole, append it normally */ +else if (ram_hp_offset + size = PCI_HOLE_START) { +ret = ram_hp_offset; +below_4g_hp_mem_size += size; +ram_hp_offset += size; +} +/* otherwise place it above 4GB */ +else { +ret = 0x1LL; +above_4g_hp_mem_size += size; +ram_hp_offset = 0x1LL + size; +} + +return ret; +} diff --git a/hw/pc.h b/hw/pc.h index 
e4db071..f3304fc 100644 --- a/hw/pc.h +++ b/hw/pc.h @@ -10,6 +10,7 @@ #include memory.h #include ioapic.h +#define PCI_HOLE_START 0xe000 /* PC-style peripherals (also used by other machines). */ /* serial.c */ @@ -214,6 +215,11 @@ static inline bool isa_ne2000_init(ISABus *bus, int base, int irq, NICInfo *nd) /* pc_sysfw.c */ void pc_system_firmware_init(MemoryRegion *rom_memory); +/* memory hotplug */ +target_phys_addr_t pc_set_hp_memory_offset(uint64_t size); +extern ram_addr_t below_4g_hp_mem_size; +extern ram_addr_t above_4g_hp_mem_size; + /* e820 types */ #define E820_RAM1 #define E820_RESERVED 2 diff --git a/hw/pc_piix.c b/hw/pc_piix.c index 88ff041..d1fd276 100644 --- a/hw/pc_piix.c +++ b/hw/pc_piix.c @@ -43,6 +43,7 @@ #include xen.h #include memory.h #include exec-memory.h +#include dimm.h #ifdef CONFIG_XEN # include xen/hvm/hvm_info_table.h #endif @@ -155,9 +156,9 @@ static void pc_init1(MemoryRegion *system_memory, kvmclock_create(); } -if (ram_size = 0xe000 ) { -above_4g_mem_size = ram_size - 0xe000; -below_4g_mem_size = 0xe000; +if (ram_size = PCI_HOLE_START ) { +above_4g_mem_size = ram_size - PCI_HOLE_START; +below_4g_mem_size = PCI_HOLE_START; } else { above_4g_mem_size = 0; below_4g_mem_size = ram_size; @@ -172,6 +173,9 @@ static void pc_init1(MemoryRegion *system_memory, rom_memory = system_memory; } +/* adjust memory map for hotplug dimms */ +dimm_calc_offsets(pc_set_hp_memory_offset); + /* allocate ram and load rom/bios */ if (!xen_enabled()) { fw_cfg = pc_memory_init(system_memory, @@ -192,9 +196,11 @@ static void pc_init1(MemoryRegion *system_memory, if (pci_enabled) { pci_bus = i440fx_init(i440fx_state, piix3_devfn, isa_bus, gsi, system_memory, system_io, ram_size, - below_4g_mem_size, - 0x1ULL - below_4g_mem_size, - 0x1ULL + above_4g_mem_size, + below_4g_mem_size + below_4g_hp_mem_size, + 0x1ULL - below_4g_mem_size +- below_4g_hp_mem_size, + 0x1ULL + above_4g_mem_size ++ above_4g_hp_mem_size, (sizeof(target_phys_addr_t) == 4 ? 
0 : ((uint64_t)1 62)), @@ -223,6 +229,8 @@ static void pc_init1(MemoryRegion *system_memory,
[RFC PATCH v3 19/19][SeaBIOS] Calculate pcimem_start and pcimem64_start from SRAT entries
pcimem_start and pcimem64_start are adjusted from srat entries. For this reason, paravirt info (NUMA SRAT entries and number of cpus) need to be read before pci_setup. Imho, this is an ugly code change since SRAT bios tables and number of cpus have to be read earlier. But the advantage is that no new paravirt interface is introduced. Suggestions to make the code change cleaner are welcome. The alternative patch (will be sent as a reply to this patch) implements a paravirt interface to read the starting values of pcimem_start and pcimem64_start from QEMU. Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- src/acpi.c| 82 src/acpi.h|3 ++ src/pciinit.c |6 +++- src/post.c|3 ++ src/smp.c |4 +++ 5 files changed, 72 insertions(+), 26 deletions(-) diff --git a/src/acpi.c b/src/acpi.c index 1223b52..9e99aa7 100644 --- a/src/acpi.c +++ b/src/acpi.c @@ -428,7 +428,10 @@ encodeLen(u8 *ssdt_ptr, int length, int bytes) #define MEM_OFFSET_END 63 #define MEM_OFFSET_SIZE 79 -u64 nb_hp_memslots = 0; +u64 nb_hp_memslots = 0, nb_numanodes; +u64 *numa_data, *hp_memdata; +u64 below_4g_hp_mem_size = 0; +u64 above_4g_hp_mem_size = 0; struct srat_memory_affinity *mem; #define SSDT_SIGNATURE 0x54445353 // SSDT @@ -763,17 +766,7 @@ acpi_build_srat_memory(struct srat_memory_affinity *numamem, static void * build_srat(void) { -int nb_numa_nodes = qemu_cfg_get_numa_nodes(); - -u64 *numadata = malloc_tmphigh(sizeof(u64) * (MaxCountCPUs + nb_numa_nodes)); -if (!numadata) { -warn_noalloc(); -return NULL; -} - -qemu_cfg_get_numa_data(numadata, MaxCountCPUs + nb_numa_nodes); - -qemu_cfg_get_numa_data(nb_hp_memslots, 1); +int nb_numa_nodes = nb_numanodes; struct system_resource_affinity_table *srat; int srat_size = sizeof(*srat) + sizeof(struct srat_processor_affinity) * MaxCountCPUs + @@ -782,7 +775,7 @@ build_srat(void) srat = malloc_high(srat_size); if (!srat) { warn_noalloc(); -free(numadata); +free(numa_data); return NULL; } @@ -791,6 +784,7 @@ build_srat(void) struct 
srat_processor_affinity *core = (void*)(srat + 1); int i; u64 curnode; +u64 *numadata = numa_data; for (i = 0; i MaxCountCPUs; ++i) { core-type = SRAT_PROCESSOR; @@ -847,15 +841,7 @@ build_srat(void) mem = (void*)numamem; if (nb_hp_memslots) { -u64 *hpmemdata = malloc_tmphigh(sizeof(u64) * (3 * nb_hp_memslots)); -if (!hpmemdata) { -warn_noalloc(); -free(hpmemdata); -free(numadata); -return NULL; -} - -qemu_cfg_get_numa_data(hpmemdata, 3 * nb_hp_memslots); +u64 *hpmemdata = hp_memdata; for (i = 1; i nb_hp_memslots + 1; ++i) { mem_base = *hpmemdata++; @@ -865,7 +851,7 @@ build_srat(void) numamem++; slots++; } -free(hpmemdata); +free(hp_memdata); } for (; slots nb_numa_nodes + nb_hp_memslots + 2; slots++) { @@ -875,10 +861,58 @@ build_srat(void) build_header((void*)srat, SRAT_SIGNATURE, srat_size, 1); -free(numadata); +free(numa_data); return srat; } +/* QEMU paravirt SRAT entries need to be read in before pci initilization */ +void read_srat_early(void) +{ +int i; + +nb_numanodes = qemu_cfg_get_numa_nodes(); +u64 *hpmemdata; +u64 mem_len, mem_base; + +numa_data = malloc_tmphigh(sizeof(u64) * (MaxCountCPUs + nb_numanodes)); +if (!numa_data) { +warn_noalloc(); +} + +qemu_cfg_get_numa_data(numa_data, MaxCountCPUs + nb_numanodes); +qemu_cfg_get_numa_data(nb_hp_memslots, 1); + +if (nb_hp_memslots) { +hp_memdata = malloc_tmphigh(sizeof(u64) * (3 * nb_hp_memslots)); +if (!hp_memdata) { +warn_noalloc(); +free(hp_memdata); +free(numa_data); +} + +qemu_cfg_get_numa_data(hp_memdata, 3 * nb_hp_memslots); +hpmemdata = hp_memdata; + +for (i = 1; i nb_hp_memslots + 1; ++i) { +mem_base = *hpmemdata++; +mem_len = *hpmemdata++; +hpmemdata++; +if (mem_base = 0x1LL) { +above_4g_hp_mem_size += mem_len; +} +/* if dimm fits before pci hole, append it normally */ +else if (mem_base + mem_len = BUILD_PCIMEM_START) { +below_4g_hp_mem_size += mem_len; +} +/* otherwise place it above 4GB */ +else { +above_4g_hp_mem_size += mem_len; +} +} + +} +} + static const struct pci_device_id 
acpi_find_tbl[] = { /* PIIX4 Power Management device. */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371AB_3, NULL), diff --git a/src/acpi.h b/src/acpi.h index cb21561..d29837f
[RFC PATCH v3 16/19] Update dimm state on reset
In case of hot-remove failure on a guest that does not implement _OST, the dimm bitmaps in qemu and Seabios show the dimm as unplugged, but the dimm is still present on the qdev/memory bus.
+ */ +QTAILQ_FOREACH(slot, bus->dimmlist, nextdimm) { +if (slot->pending == DIMM_REMOVE_PENDING) { +if (bus->dimm_revert) +bus->dimm_revert(bus->dimm_hotplug_qdev, slot, 0); +} +} +} + /* used to create a dimm device, only on incoming migration of a hotplugged * RAMBlock */ diff --git a/hw/dimm.h b/hw/dimm.h index a6c6e6f..ce091fe 100644 --- a/hw/dimm.h +++ b/hw/dimm.h @@ -95,5 +95,6 @@ void main_memory_bus_create(Object *parent); void dimm_config_create(char *id, uint64_t size, uint64_t node, uint32_t dimm_idx, uint32_t populated); uint64_t get_hp_memory_total(void); +void dimm_state_sync(void); #endif -- 1.7.9 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3 15/19] Add _OST dimm support
This allows qemu to receive notifications from the guest OS on success or failure of a memory hotplug request. The guest OS needs to implement the _OST functionality for this to work (linux-next: http://lkml.org/lkml/2012/6/25/321) This patch also updates dimm bitmap state and hot-remove pending flag on hot-remove fail. This allows failed hot operations to be retried at anytime. This only works for guests that use _OST notification. Also adds new _OST registers in docs/specs/acpi_hotplug.txt Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- docs/specs/acpi_hotplug.txt | 25 + hw/acpi_piix4.c | 35 ++- hw/dimm.c | 28 +++- hw/dimm.h | 10 +- 4 files changed, 95 insertions(+), 3 deletions(-) diff --git a/docs/specs/acpi_hotplug.txt b/docs/specs/acpi_hotplug.txt index cf86242..536da16 100644 --- a/docs/specs/acpi_hotplug.txt +++ b/docs/specs/acpi_hotplug.txt @@ -20,3 +20,28 @@ ejected. Written by ACPI memory device _EJ0 method to notify qemu of successfull hot-removal. Write-only. + +Memory Dimm ejection failure notification (IO port 0xafa1, 1-byte access): +--- +Dimm hot-remove _OST notification. Byte value indicates Dimm slot for which +ejection failed. + +Written by ACPI memory device _OST method to notify qemu of failed +hot-removal. Write-only. + +Memory Dimm insertion success notification (IO port 0xafa2, 1-byte access): +--- +Dimm hot-remove _OST notification. Byte value indicates Dimm slot for which +insertion succeeded. + +Written by ACPI memory device _OST method to notify qemu of failed +hot-add. Write-only. + +Memory Dimm insertion failure notification (IO port 0xafa3, 1-byte access): +--- +Dimm hot-remove _OST notification. Byte value indicates Dimm slot for which +insertion failed. + +Written by ACPI memory device _OST method to notify qemu of failed +hot-add. Write-only. 
+ diff --git a/hw/acpi_piix4.c b/hw/acpi_piix4.c index 8776669..f7220d4 100644 --- a/hw/acpi_piix4.c +++ b/hw/acpi_piix4.c @@ -49,6 +49,9 @@ #define PCI_RMV_BASE 0xae0c #define MEM_BASE 0xaf80 #define MEM_EJ_BASE 0xafa0 +#define MEM_OST_REMOVE_FAIL 0xafa1 +#define MEM_OST_ADD_SUCCESS 0xafa2 +#define MEM_OST_ADD_FAIL 0xafa3 #define PIIX4_MEM_HOTPLUG_STATUS 8 #define PIIX4_PCI_HOTPLUG_STATUS 2 @@ -87,6 +90,7 @@ typedef struct PIIX4PMState { uint8_t s4_val; } PIIX4PMState; +static int piix4_dimm_revert(DeviceState *qdev, DimmDevice *dev, int add); static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s); #define ACPI_ENABLE 0xf1 @@ -531,6 +535,15 @@ static void gpe_writeb(void *opaque, uint32_t addr, uint32_t val) case MEM_EJ_BASE: dimm_notify(val, DIMM_REMOVE_SUCCESS); break; +case MEM_OST_REMOVE_FAIL: +dimm_notify(val, DIMM_REMOVE_FAIL); +break; +case MEM_OST_ADD_SUCCESS: +dimm_notify(val, DIMM_ADD_SUCCESS); +break; +case MEM_OST_ADD_FAIL: +dimm_notify(val, DIMM_ADD_FAIL); +break; default: acpi_gpe_ioport_writeb(s-ar, addr, val); } @@ -604,13 +617,16 @@ static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s) register_ioport_read(MEM_BASE, DIMM_BITMAP_BYTES, 1, gpe_readb, s); register_ioport_write(MEM_EJ_BASE, 1, 1, gpe_writeb, s); +register_ioport_write(MEM_OST_REMOVE_FAIL, 1, 1, gpe_writeb, s); +register_ioport_write(MEM_OST_ADD_SUCCESS, 1, 1, gpe_writeb, s); +register_ioport_write(MEM_OST_ADD_FAIL, 1, 1, gpe_writeb, s); for(i = 0; i DIMM_BITMAP_BYTES; i++) { s-gperegs.mems_sts[i] = 0; } pci_bus_hotplug(bus, piix4_device_hotplug, s-dev.qdev); -dimm_bus_hotplug(piix4_dimm_hotplug, s-dev.qdev); +dimm_bus_hotplug(piix4_dimm_hotplug, piix4_dimm_revert, s-dev.qdev); } static void enable_device(PIIX4PMState *s, int slot) @@ -656,6 +672,23 @@ static int piix4_dimm_hotplug(DeviceState *qdev, DimmDevice *dev, int return 0; } +static int piix4_dimm_revert(DeviceState *qdev, DimmDevice *dev, int add) +{ +PCIDevice *pci_dev = 
DO_UPCAST(PCIDevice, qdev, qdev); +PIIX4PMState *s = DO_UPCAST(PIIX4PMState, dev, pci_dev); +struct gpe_regs *g = &s->gperegs; +DimmDevice *slot = DIMM(dev); +int idx = slot->idx; + +if (add) { +g->mems_sts[idx/8] &= ~(1 << (idx%8)); +} +else { +g->mems_sts[idx/8] |= (1 << (idx%8)); +} +return 0; +} + static int piix4_device_hotplug(DeviceState *qdev, PCIDevice *dev, PCIHotplugState state) { diff --git a/hw/dimm.c b/hw/dimm.c index 21626f6..1521462 100644 --- a/hw/dimm.c +++ b/hw/dimm.c @@
[RFC PATCH v3 14/19][SeaBIOS] Add _OST dimm method
Add support for _OST method. _OST method will write into the correct I/O byte to signal success / failure of hot-add or hot-remove to qemu. Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- src/acpi-dsdt.dsl | 50 ++ src/ssdt-mem.dsl |4 2 files changed, 54 insertions(+), 0 deletions(-) diff --git a/src/acpi-dsdt.dsl b/src/acpi-dsdt.dsl index 5d3e92b..0d37bbc 100644 --- a/src/acpi-dsdt.dsl +++ b/src/acpi-dsdt.dsl @@ -762,6 +762,28 @@ DefinitionBlock ( MPE, 8 } + +/* Memory hot-remove notify failure byte */ +OperationRegion(MEEF, SystemIO, 0xafa1, 1) +Field (MEEF, ByteAcc, NoLock, Preserve) +{ +MEF, 8 +} + +/* Memory hot-add notify success byte */ +OperationRegion(MPIS, SystemIO, 0xafa2, 1) +Field (MPIS, ByteAcc, NoLock, Preserve) +{ +MIS, 8 +} + +/* Memory hot-add notify failure byte */ +OperationRegion(MPIF, SystemIO, 0xafa3, 1) +Field (MPIF, ByteAcc, NoLock, Preserve) +{ +MIF, 8 +} + Method(MESC, 0) { // Local5 = active memdevice bitmap Store (MES, Local5) @@ -802,6 +824,34 @@ DefinitionBlock ( Store(Arg0, MPE) Sleep(200) } +Method (MOST, 3, Serialized) { +// _OST method - OS status indication +Switch (And(Arg0, 0xFF)) { +Case(0x3) +{ +Switch(And(Arg1, 0xFF)) { +Case(0x1) { +Store(Arg2, MEF) +// Revert MEON flag for this memory device to one +Store(One, Index(MEON, Arg2)) +} +} +} +Case(0x1) +{ +Switch(And(Arg1, 0xFF)) { +Case(0x0) { +Store(Arg2, MIS) +} +Case(0x1) { +Store(Arg2, MIF) +// Revert MEON flag for this memory device to zero +Store(Zero, Index(MEON, Arg2)) +} +} +} +} +} } diff --git a/src/ssdt-mem.dsl b/src/ssdt-mem.dsl index ee322f0..041d301 100644 --- a/src/ssdt-mem.dsl +++ b/src/ssdt-mem.dsl @@ -38,6 +38,7 @@ DefinitionBlock (ssdt-mem.aml, SSDT, 0x02, BXPC, CSSDT, 0x1) External(CMST, MethodObj) External(MPEJ, MethodObj) +External(MOST, MethodObj) Name(_CRS, ResourceTemplate() { QwordMemory( @@ -60,6 +61,9 @@ DefinitionBlock (ssdt-mem.aml, SSDT, 0x02, BXPC, CSSDT, 0x1) Method (_EJ0, 1, NotSerialized) { MPEJ(ID, Arg0) } 
+Method (_OST, 3) { +MOST(Arg0, Arg1, ID) +} } } -- 1.7.9 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3 13/19] balloon: update with hotplugged memory
query-balloon and info balloon should report total memory available to the guest. balloon inflate/ deflate can also use all memory available to the guest (initial + hotplugged memory) Ballon driver has been minimaly tested with the patch, please review and test. Caveat: if the guest does not online hotplugged-memory, it's easy for a balloon inflate command to OOM a guest. Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- hw/virtio-balloon.c | 13 + 1 files changed, 9 insertions(+), 4 deletions(-) diff --git a/hw/virtio-balloon.c b/hw/virtio-balloon.c index dd1a650..bca21bc 100644 --- a/hw/virtio-balloon.c +++ b/hw/virtio-balloon.c @@ -22,6 +22,7 @@ #include virtio-balloon.h #include kvm.h #include exec-memory.h +#include dimm.h #if defined(__linux__) #include sys/mman.h @@ -147,10 +148,11 @@ static void virtio_balloon_set_config(VirtIODevice *vdev, VirtIOBalloon *dev = to_virtio_balloon(vdev); struct virtio_balloon_config config; uint32_t oldactual = dev-actual; +uint64_t hotplugged_ram_size = get_hp_memory_total(); memcpy(config, config_data, 8); dev-actual = le32_to_cpu(config.actual); if (dev-actual != oldactual) { -qemu_balloon_changed(ram_size - +qemu_balloon_changed(ram_size + hotplugged_ram_size - (dev-actual VIRTIO_BALLOON_PFN_SHIFT)); } } @@ -188,17 +190,20 @@ static void virtio_balloon_stat(void *opaque, BalloonInfo *info) info-actual = ram_size - ((uint64_t) dev-actual VIRTIO_BALLOON_PFN_SHIFT); +info-actual += get_hp_memory_total(); } static void virtio_balloon_to_target(void *opaque, ram_addr_t target) { VirtIOBalloon *dev = opaque; +uint64_t hotplugged_ram_size = get_hp_memory_total(); -if (target ram_size) { -target = ram_size; +if (target ram_size + hotplugged_ram_size) { +target = ram_size + hotplugged_ram_size; } if (target) { -dev-num_pages = (ram_size - target) VIRTIO_BALLOON_PFN_SHIFT; +dev-num_pages = (ram_size + hotplugged_ram_size - target) + VIRTIO_BALLOON_PFN_SHIFT; virtio_notify_config(dev-vdev); } } -- 1.7.9 -- 
To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3 12/19] Implement info memory-total and query-memory-total
Returns total physical memory available to guest in bytes, including hotplugged memory. Note that the number reported here may be different from what the guest sees e.g. if the guest has not logically onlined hotplugged memory. This functionality is provided independently of a balloon device, since a guest can be using ACPI memory hotplug without using a balloon device. Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- hmp-commands.hx |2 ++ hmp.c|7 +++ hmp.h|1 + hw/dimm.c| 21 + hw/dimm.h|1 + monitor.c|7 +++ qapi-schema.json | 11 +++ qmp-commands.hx | 20 8 files changed, 70 insertions(+), 0 deletions(-) diff --git a/hmp-commands.hx b/hmp-commands.hx index cfb1b67..988d207 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -1464,6 +1464,8 @@ show qdev device model list show roms @item info memory-hotplug show memory-hotplug +@item info memory-total +show memory-total @end table ETEXI diff --git a/hmp.c b/hmp.c index 4b3d63d..cc31ddc 100644 --- a/hmp.c +++ b/hmp.c @@ -1185,3 +1185,10 @@ void hmp_info_memory_hotplug(Monitor *mon) qapi_free_MemHpInfoList(info); } + +void hmp_info_memory_total(Monitor *mon) +{ +uint64_t ram_total; +ram_total = (uint64_t)qmp_query_memory_total(NULL); +monitor_printf(mon, MemTotal: %lu \n, ram_total); +} diff --git a/hmp.h b/hmp.h index 986705a..ab96dba 100644 --- a/hmp.h +++ b/hmp.h @@ -74,5 +74,6 @@ void hmp_closefd(Monitor *mon, const QDict *qdict); void hmp_send_key(Monitor *mon, const QDict *qdict); void hmp_screen_dump(Monitor *mon, const QDict *qdict); void hmp_info_memory_hotplug(Monitor *mon); +void hmp_info_memory_total(Monitor *mon); #endif diff --git a/hw/dimm.c b/hw/dimm.c index fbd93a8..21626f6 100644 --- a/hw/dimm.c +++ b/hw/dimm.c @@ -28,6 +28,7 @@ static DimmBus *main_memory_bus; /* the following list is used to hold dimm config info before machine * initialization. 
After machine init, the list is emptied and not used anymore.*/ static DimmConfiglist dimmconfig_list = QTAILQ_HEAD_INITIALIZER(dimmconfig_list); +extern ram_addr_t ram_size; static void dimmbus_dev_print(Monitor *mon, DeviceState *dev, int indent); static char *dimmbus_get_fw_dev_path(DeviceState *dev); @@ -233,6 +234,26 @@ void setup_fwcfg_hp_dimms(uint64_t *fw_cfg_slots) } } +uint64_t get_hp_memory_total(void) +{ +DimmBus *bus = main_memory_bus; +DimmDevice *slot; +uint64_t info = 0; + +QTAILQ_FOREACH(slot, bus-dimmlist, nextdimm) { +info += slot-size; +} +return info; +} + +int64_t qmp_query_memory_total(Error **errp) +{ +uint64_t info; +info = ram_size + get_hp_memory_total(); + +return (int64_t)info; +} + void dimm_notify(uint32_t idx, uint32_t event) { DimmBus *bus = main_memory_bus; diff --git a/hw/dimm.h b/hw/dimm.h index 95251ba..21225be 100644 --- a/hw/dimm.h +++ b/hw/dimm.h @@ -86,5 +86,6 @@ int dimm_add(char *id); void main_memory_bus_create(Object *parent); void dimm_config_create(char *id, uint64_t size, uint64_t node, uint32_t dimm_idx, uint32_t populated); +uint64_t get_hp_memory_total(void); #endif diff --git a/monitor.c b/monitor.c index be9a1d9..4f5ea60 100644 --- a/monitor.c +++ b/monitor.c @@ -2747,6 +2747,13 @@ static mon_cmd_t info_cmds[] = { .mhandler.info = hmp_info_memory_hotplug, }, { +.name = memory-total, +.args_type = , +.params = , +.help = show total memory size, +.mhandler.info = hmp_info_memory_total, +}, +{ .name = NULL, }, }; diff --git a/qapi-schema.json b/qapi-schema.json index 3706a2a..c1d2571 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -2581,3 +2581,14 @@ # Since: 1.3 ## { 'command': 'query-memory-hotplug', 'returns': ['MemHpInfo'] } + +## +# @query-memory-total: +# +# Returns total memory in bytes, including hotplugged dimms +# +# Returns: int +# +# Since: 1.3 +## +{ 'command': 'query-memory-total', 'returns': 'int' } diff --git a/qmp-commands.hx b/qmp-commands.hx index e50dcc2..20b7eea 100644 --- 
a/qmp-commands.hx +++ b/qmp-commands.hx @@ -2576,3 +2576,23 @@ Example: } EQMP + +{ +.name = query-memory-total, +.args_type = , +.mhandler.cmd_new = qmp_marshal_input_query_memory_total +}, +SQMP +query-memory-total +-- + +Return total memory in bytes, including hotplugged dimms + +Example: + +- { execute: query-memory-total } +- { + return: 1073741824 + } + +EQMP -- 1.7.9 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3 07/19] acpi_piix4: Implement memory device hotplug registers
A 32-byte register is used to present up to 256 hotplug-able memory devices to BIOS and OSPM. Hot-add and hot-remove functions trigger an ACPI hotplug event through these. Only reads are allowed from these registers. An ACPI hot-remove event but needs to wait for OSPM to eject the device. We use a single-byte register to know when OSPM has called the _EJ function for a particular dimm. A write to this byte will depopulate the respective dimm. Only writes are allowed to this byte. v1-v2: mems_sts address moved from 0xaf20 to 0xaf80 (to accomodate more space for cpu-hotplugging in the future). _EJ array is reduced to a single byte. Add documentation in docs/specs/acpi_hotplug.txt v2-v3: minor name changes Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- docs/specs/acpi_hotplug.txt | 22 + hw/acpi_piix4.c | 73 -- 2 files changed, 91 insertions(+), 4 deletions(-) create mode 100644 docs/specs/acpi_hotplug.txt diff --git a/docs/specs/acpi_hotplug.txt b/docs/specs/acpi_hotplug.txt new file mode 100644 index 000..cf86242 --- /dev/null +++ b/docs/specs/acpi_hotplug.txt @@ -0,0 +1,22 @@ +QEMU-ACPI BIOS hotplug interface +-- +This document describes the interface between QEMU and the ACPI BIOS for non-PCI +space. For the PCI interface please look at docs/specs/acpi_pci_hotplug.txt + +QEMU-ACPI BIOS memory hotplug interface +-- + +Memory Dimm status array (IO port 0xaf80-0xaf9f, 1-byte access): +--- +Dimm hot-plug notification pending. One bit per slot. + +Read by ACPI BIOS GPE.3 handler to notify OS of memory hot-add or hot-remove +events. Read-only. + +Memory Dimm ejection success notification (IO port 0xafa0, 1-byte access): +--- +Dimm hot-remove _EJ0 notification. Byte value indicates Dimm slot that was +ejected. + +Written by ACPI memory device _EJ0 method to notify qemu of successfull +hot-removal. Write-only. 
diff --git a/hw/acpi_piix4.c b/hw/acpi_piix4.c index c56220b..8776669 100644 --- a/hw/acpi_piix4.c +++ b/hw/acpi_piix4.c @@ -28,6 +28,8 @@ #include range.h #include ioport.h #include fw_cfg.h +#include sysbus.h +#include dimm.h //#define DEBUG @@ -45,9 +47,15 @@ #define PCI_DOWN_BASE 0xae04 #define PCI_EJ_BASE 0xae08 #define PCI_RMV_BASE 0xae0c +#define MEM_BASE 0xaf80 +#define MEM_EJ_BASE 0xafa0 +#define PIIX4_MEM_HOTPLUG_STATUS 8 #define PIIX4_PCI_HOTPLUG_STATUS 2 +struct gpe_regs { +uint8_t mems_sts[DIMM_BITMAP_BYTES]; +}; struct pci_status { uint32_t up; /* deprecated, maintained for migration compatibility */ uint32_t down; @@ -69,6 +77,7 @@ typedef struct PIIX4PMState { Notifier machine_ready; /* for pci hotplug */ +struct gpe_regs gperegs; struct pci_status pci0_status; uint32_t pci0_hotplug_enable; uint32_t pci0_slot_device_present; @@ -93,8 +102,8 @@ static void pm_update_sci(PIIX4PMState *s) ACPI_BITMASK_POWER_BUTTON_ENABLE | ACPI_BITMASK_GLOBAL_LOCK_ENABLE | ACPI_BITMASK_TIMER_ENABLE)) != 0) || -(((s-ar.gpe.sts[0] s-ar.gpe.en[0]) - PIIX4_PCI_HOTPLUG_STATUS) != 0); +(((s-ar.gpe.sts[0] s-ar.gpe.en[0]) + (PIIX4_PCI_HOTPLUG_STATUS | PIIX4_MEM_HOTPLUG_STATUS)) != 0); qemu_set_irq(s-irq, sci_level); /* schedule a timer interruption if needed */ @@ -499,7 +508,16 @@ type_init(piix4_pm_register_types) static uint32_t gpe_readb(void *opaque, uint32_t addr) { PIIX4PMState *s = opaque; -uint32_t val = acpi_gpe_ioport_readb(s-ar, addr); +uint32_t val = 0; +struct gpe_regs *g = s-gperegs; + +switch (addr) { +case MEM_BASE ... 
MEM_BASE+DIMM_BITMAP_BYTES: +val = g->mems_sts[addr - MEM_BASE]; +break; +default: +val = acpi_gpe_ioport_readb(&s->ar, addr); +} PIIX4_DPRINTF("gpe read %x == %x\n", addr, val); return val; @@ -509,7 +527,13 @@ static void gpe_writeb(void *opaque, uint32_t addr, uint32_t val) { PIIX4PMState *s = opaque; -acpi_gpe_ioport_writeb(&s->ar, addr, val); +switch (addr) { +case MEM_EJ_BASE: +dimm_notify(val, DIMM_REMOVE_SUCCESS); +break; +default: +acpi_gpe_ioport_writeb(&s->ar, addr, val); +} pm_update_sci(s); PIIX4_DPRINTF("gpe write %x == %d\n", addr, val); @@ -560,9 +584,11 @@ static uint32_t pcirmv_read(void *opaque, uint32_t addr) static int piix4_device_hotplug(DeviceState *qdev, PCIDevice *dev, PCIHotplugState state); +static int piix4_dimm_hotplug(DeviceState *qdev, DimmDevice *dev, int add); static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s) { +int i = 0;
[RFC PATCH v3 05/19] Implement dimm device abstraction
Each hotplug-able memory slot is a DimmDevice. All DimmDevices are attached to a new bus called DimmBus. This bus is introduced so that we no longer depend on hotplug-capability of main system bus (the main bus does not allow hotplugging). The DimmBus should be attached to a chipset Device (i440fx in case of the pc) A hot-add operation for a particular dimm: - creates a new DimmDevice and attaches it to the DimmBus - creates a new MemoryRegion of the given physical address offset, size and node proximity, and attaches it to main system memory as a sub_region. A successful hot-remove operation detaches and frees the MemoryRegion from system memory, and removes the DimmDevice from the DimmBus. Hotplug operations are done through normal device_add /device_del commands. Also add properties to DimmDevice. Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- hw/dimm.c | 305 + hw/dimm.h | 90 ++ 2 files changed, 395 insertions(+), 0 deletions(-) create mode 100644 hw/dimm.c create mode 100644 hw/dimm.h diff --git a/hw/dimm.c b/hw/dimm.c new file mode 100644 index 000..288b997 --- /dev/null +++ b/hw/dimm.c @@ -0,0 +1,305 @@ +/* + * Dimm device for Memory Hotplug + * + * Copyright ProfitBricks GmbH 2012 + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see http://www.gnu.org/licenses/ + */ + +#include trace.h +#include qdev.h +#include dimm.h +#include time.h +#include ../exec-memory.h +#include qmp-commands.h + +/* the system-wide memory bus. */ +static DimmBus *main_memory_bus; +/* the following list is used to hold dimm config info before machine + * initialization. After machine init, the list is emptied and not used anymore.*/ +static DimmConfiglist dimmconfig_list = QTAILQ_HEAD_INITIALIZER(dimmconfig_list); + +static void dimmbus_dev_print(Monitor *mon, DeviceState *dev, int indent); +static char *dimmbus_get_fw_dev_path(DeviceState *dev); + +static Property dimm_properties[] = { +DEFINE_PROP_UINT64(start, DimmDevice, start, 0), +DEFINE_PROP_UINT64(size, DimmDevice, size, DEFAULT_DIMMSIZE), +DEFINE_PROP_UINT32(node, DimmDevice, node, 0), +DEFINE_PROP_END_OF_LIST(), +}; + +static void dimmbus_dev_print(Monitor *mon, DeviceState *dev, int indent) +{ +} + +static char *dimmbus_get_fw_dev_path(DeviceState *dev) +{ +char path[40]; + +snprintf(path, sizeof(path), %s, qdev_fw_name(dev)); +return strdup(path); +} + +static void dimm_bus_class_init(ObjectClass *klass, void *data) +{ +BusClass *k = BUS_CLASS(klass); + +k-print_dev = dimmbus_dev_print; +k-get_fw_dev_path = dimmbus_get_fw_dev_path; +} + +static void dimm_bus_initfn(Object *obj) +{ +DimmConfig *dimm_cfg, *next_dimm_cfg; +DimmBus *bus = DIMM_BUS(obj); +QTAILQ_INIT(bus-dimmconfig_list); +QTAILQ_INIT(bus-dimmlist); + +QTAILQ_FOREACH_SAFE(dimm_cfg, dimmconfig_list, nextdimmcfg, next_dimm_cfg) { +QTAILQ_REMOVE(dimmconfig_list, dimm_cfg, nextdimmcfg); +QTAILQ_INSERT_TAIL(bus-dimmconfig_list, dimm_cfg, nextdimmcfg); +} +} + +static const TypeInfo dimm_bus_info = { +.name = TYPE_DIMM_BUS, +.parent = TYPE_BUS, +.instance_size = sizeof(DimmBus), +.instance_init = dimm_bus_initfn, +.class_init = dimm_bus_class_init, +}; + +void 
main_memory_bus_create(Object *parent) +{ +main_memory_bus = g_malloc0(dimm_bus_info.instance_size); +main_memory_bus-qbus.glib_allocated = true; +qbus_create_inplace(main_memory_bus-qbus, TYPE_DIMM_BUS, DEVICE(parent), +membus); +} + +static void dimm_populate(DimmDevice *s) +{ +DeviceState *dev= (DeviceState*)s; +MemoryRegion *new = NULL; + +new = g_malloc(sizeof(MemoryRegion)); +memory_region_init_ram(new, dev-id, s-size); +vmstate_register_ram_global(new); +memory_region_add_subregion(get_system_memory(), s-start, new); +s-mr = new; +} + +static void dimm_depopulate(DimmDevice *s) +{ +assert(s); +vmstate_unregister_ram(s-mr, NULL); +memory_region_del_subregion(get_system_memory(), s-mr); +memory_region_destroy(s-mr); +s-mr = NULL; +} + +void dimm_config_create(char *id, uint64_t size, uint64_t node, uint32_t +dimm_idx, uint32_t populated) +{ +DimmConfig *dimm_cfg; +dimm_cfg = (DimmConfig*) g_malloc0(sizeof(DimmConfig)); +dimm_cfg-name = id;
[RFC PATCH v3 06/19] Implement -dimm command line option
Example: -dimm id=dimm0,size=512M,node=0,populated=off will define a 512M memory slot belonging to numa node 0. When populated=on, a DimmDevice is created and hot-plugged at system startup. Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- hw/Makefile.objs |2 +- qemu-config.c| 25 + qemu-options.hx |5 + sysemu.h |1 + vl.c | 50 ++ 5 files changed, 82 insertions(+), 1 deletions(-) diff --git a/hw/Makefile.objs b/hw/Makefile.objs index 6dfebd2..8c5c39a 100644 --- a/hw/Makefile.objs +++ b/hw/Makefile.objs @@ -26,7 +26,7 @@ hw-obj-$(CONFIG_I8254) += i8254_common.o i8254.o hw-obj-$(CONFIG_PCSPK) += pcspk.o hw-obj-$(CONFIG_PCKBD) += pckbd.o hw-obj-$(CONFIG_FDC) += fdc.o -hw-obj-$(CONFIG_ACPI) += acpi.o acpi_piix4.o +hw-obj-$(CONFIG_ACPI) += acpi.o acpi_piix4.o dimm.o hw-obj-$(CONFIG_APM) += pm_smbus.o apm.o hw-obj-$(CONFIG_DMA) += dma.o hw-obj-$(CONFIG_I82374) += i82374.o diff --git a/qemu-config.c b/qemu-config.c index eba977e..4022d64 100644 --- a/qemu-config.c +++ b/qemu-config.c @@ -646,6 +646,30 @@ QemuOptsList qemu_boot_opts = { }, }; +static QemuOptsList qemu_dimm_opts = { +.name = dimm, +.head = QTAILQ_HEAD_INITIALIZER(qemu_dimm_opts.head), +.desc = { +{ +.name = id, +.type = QEMU_OPT_STRING, +.help = id of this dimm device, +},{ +.name = size, +.type = QEMU_OPT_SIZE, +.help = memory size for this dimm, +},{ +.name = populated, +.type = QEMU_OPT_BOOL, +.help = populated for this dimm, +},{ +.name = node, +.type = QEMU_OPT_NUMBER, +.help = NUMA node number (i.e. proximity) for this dimm, +}, +{ /* end of list */ } +}, +}; static QemuOptsList *vm_config_groups[32] = { qemu_drive_opts, qemu_chardev_opts, @@ -662,6 +686,7 @@ static QemuOptsList *vm_config_groups[32] = { qemu_boot_opts, qemu_iscsi_opts, qemu_sandbox_opts, +qemu_dimm_opts, NULL, }; diff --git a/qemu-options.hx b/qemu-options.hx index 804a2d1..3687722 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -2842,3 +2842,8 @@ HXCOMM This is the last statement. 
Insert new options before this line! STEXI @end table ETEXI + +DEF(dimm, HAS_ARG, QEMU_OPTION_dimm, +-dimm id=dimmid,size=sz,node=nd,populated=on|off\n +specify memory dimm device with name dimmid, size sz on node nd, +QEMU_ARCH_ALL) diff --git a/sysemu.h b/sysemu.h index 65552ac..7baf9c9 100644 --- a/sysemu.h +++ b/sysemu.h @@ -139,6 +139,7 @@ extern QEMUClock *rtc_clock; extern int nb_numa_nodes; extern uint64_t node_mem[MAX_NODES]; extern unsigned long *node_cpumask[MAX_NODES]; +extern int nb_hp_dimms; #define MAX_OPTION_ROMS 16 typedef struct QEMUOptionRom { diff --git a/vl.c b/vl.c index 7c577fa..af1745c 100644 --- a/vl.c +++ b/vl.c @@ -126,6 +126,7 @@ int main(int argc, char **argv) #include hw/xen.h #include hw/qdev.h #include hw/loader.h +#include hw/dimm.h #include bt-host.h #include net.h #include net/slirp.h @@ -248,6 +249,7 @@ QTAILQ_HEAD(, FWBootEntry) fw_boot_order = QTAILQ_HEAD_INITIALIZER(fw_boot_order int nb_numa_nodes; uint64_t node_mem[MAX_NODES]; unsigned long *node_cpumask[MAX_NODES]; +int nb_hp_dimms; uint8_t qemu_uuid[16]; @@ -530,6 +532,37 @@ static void configure_rtc_date_offset(const char *startdate, int legacy) } } +static void configure_dimm(QemuOpts *opts) +{ +const char *id; +uint64_t size, node; +bool populated; +QemuOpts *devopts; +char buf[256]; +if (nb_hp_dimms == MAX_DIMMS) { +fprintf(stderr, qemu: maximum number of DIMMs (%d) exceeded\n, +MAX_DIMMS); +exit(1); +} +id = qemu_opts_id(opts); +size = qemu_opt_get_size(opts, size, DEFAULT_DIMMSIZE); +populated = qemu_opt_get_bool(opts, populated, 0); +node = qemu_opt_get_number(opts, node, 0); + +dimm_config_create((char*)id, size, node, nb_hp_dimms, 0); + +if (populated) { +devopts = qemu_opts_create(qemu_find_opts(device), id, 0, NULL); +qemu_opt_set(devopts, driver, dimm); +snprintf(buf, sizeof(buf), %lu, size); +qemu_opt_set(devopts, size, buf); +snprintf(buf, sizeof(buf), %lu, node); +qemu_opt_set(devopts, node, buf); +qemu_opt_set(devopts, bus, membus); +} +nb_hp_dimms++; +} + 
static void configure_rtc(QemuOpts *opts) { const char *value; @@ -2354,6 +2387,8 @@ int main(int argc, char **argv, char **envp) DisplayChangeListener *dcl; int cyls, heads, secs, translation; QemuOpts *hda_opts = NULL, *opts, *machine_opts; +QemuOpts *dimm_opts[MAX_DIMMS]; +int nb_dimm_opts = 0; QemuOptsList *olist; int optind; const char *optarg; @@ -3288,6 +3323,18 @@ int
[RFC PATCH v3 02/19][SeaBIOS] Add SSDT memory device support
Define SSDT hotplug-able memory devices in _SB namespace. The dynamically generated SSDT includes per memory device hotplug methods. These methods just call methods defined in the DSDT. Also dynamically generate a MTFY method and a MEON array of the online/available memory devices. ACPI extraction macros are used to place the AML code in variables later used by src/acpi. The design is taken from SSDT cpu generation. Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- Makefile |2 +- src/ssdt-mem.dsl | 65 ++ 2 files changed, 66 insertions(+), 1 deletions(-) create mode 100644 src/ssdt-mem.dsl diff --git a/Makefile b/Makefile index 5486f88..e82cfc9 100644 --- a/Makefile +++ b/Makefile @@ -233,7 +233,7 @@ $(OUT)%.hex: src/%.dsl ./tools/acpi_extract_preprocess.py ./tools/acpi_extract.p $(Q)$(PYTHON) ./tools/acpi_extract.py $(OUT)$*.lst $(OUT)$*.off $(Q)cat $(OUT)$*.off $@ -$(OUT)ccode32flat.o: $(OUT)acpi-dsdt.hex $(OUT)ssdt-proc.hex $(OUT)ssdt-pcihp.hex $(OUT)ssdt-susp.hex +$(OUT)ccode32flat.o: $(OUT)acpi-dsdt.hex $(OUT)ssdt-proc.hex $(OUT)ssdt-pcihp.hex $(OUT)ssdt-susp.hex $(OUT)ssdt-mem.hex Kconfig rules diff --git a/src/ssdt-mem.dsl b/src/ssdt-mem.dsl new file mode 100644 index 000..ee322f0 --- /dev/null +++ b/src/ssdt-mem.dsl @@ -0,0 +1,65 @@ +/* This file is the basis for the ssdt_mem[] variable in src/acpi.c. + * It is similar in design to the ssdt_proc variable. + * It defines the contents of the per-cpu Processor() object. At + * runtime, a dynamically generated SSDT will contain one copy of this + * AML snippet for every possible memory device in the system. The + * objects will * be placed in the \_SB_ namespace. + * + * In addition to the aml code generated from this file, the + * src/acpi.c file creates a MEMNTFY method with an entry for each memdevice: + * Method(MTFY, 2) { + * If (LEqual(Arg0, 0x00)) { Notify(MP00, Arg1) } + * If (LEqual(Arg0, 0x01)) { Notify(MP01, Arg1) } + * ... 
+ * } + * and a MEON array with the list of active and inactive memory devices: + * Name(MEON, Package() { One, One, ..., Zero, Zero, ... }) + */ +ACPI_EXTRACT_ALL_CODE ssdm_mem_aml + +DefinitionBlock (ssdt-mem.aml, SSDT, 0x02, BXPC, CSSDT, 0x1) +/* v-- DO NOT EDIT --v */ +{ +ACPI_EXTRACT_DEVICE_START ssdt_mem_start +ACPI_EXTRACT_DEVICE_END ssdt_mem_end +ACPI_EXTRACT_DEVICE_STRING ssdt_mem_name +Device(MPAA) { +ACPI_EXTRACT_NAME_BYTE_CONST ssdt_mem_id +Name(ID, 0xAA) +/* ^-- DO NOT EDIT --^ + * + * The src/acpi.c code requires the above layout so that it can update + * MPAA and 0xAA with the appropriate MEMDEVICE id (see + * SD_OFFSET_MEMHEX/MEMID1/MEMID2). Don't change the above without + * also updating the C code. + */ +Name(_HID, EISAID(PNP0C80)) +Name(_PXM, 0xAA) + +External(CMST, MethodObj) +External(MPEJ, MethodObj) + +Name(_CRS, ResourceTemplate() { +QwordMemory( + ResourceConsumer, + , + MinFixed, + MaxFixed, + Cacheable, + ReadWrite, + 0x0, + 0xDEADBEEF, + 0xE6ADBEEE, + 0x, + 0x0800, + ) +}) +Method (_STA, 0) { +Return(CMST(ID)) +} +Method (_EJ0, 1, NotSerialized) { +MPEJ(ID, Arg0) +} +} +} + -- 1.7.9 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3 19/19] alternative: Introduce paravirt interface QEMU_CFG_PCI_WINDOW
Qemu already calculates the 32-bit and 64-bit PCI starting offsets based on initial memory and hotplug-able dimms. This info needs to be passed to Seabios for PCI initialization. Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- docs/specs/fwcfg.txt |9 + hw/fw_cfg.h |1 + hw/pc_piix.c | 10 ++ 3 files changed, 20 insertions(+), 0 deletions(-) diff --git a/docs/specs/fwcfg.txt b/docs/specs/fwcfg.txt index 55f96d9..d9fa215 100644 --- a/docs/specs/fwcfg.txt +++ b/docs/specs/fwcfg.txt @@ -26,3 +26,12 @@ Entry max_cpus+nb_numa_nodes+1 contains the number of memory dimms (nb_hp_dimms) The last 3 * nb_hp_dimms entries are organized in triplets: Each triplet contains the physical address offset, size (in bytes), and node proximity for the respective dimm. + +FW_CFG_PCI_WINDOW paravirt info + +QEMU passes the starting address for the 32-bit and 64-bit PCI windows to BIOS. +The following layouts are followed: + + +pcimem32_start | pcimem64_start | + diff --git a/hw/fw_cfg.h b/hw/fw_cfg.h index 856bf91..6c8c151 100644 --- a/hw/fw_cfg.h +++ b/hw/fw_cfg.h @@ -27,6 +27,7 @@ #define FW_CFG_SETUP_SIZE 0x17 #define FW_CFG_SETUP_DATA 0x18 #define FW_CFG_FILE_DIR 0x19 +#define FW_CFG_PCI_WINDOW 0x1a #define FW_CFG_FILE_FIRST 0x20 #define FW_CFG_FILE_SLOTS 0x10 diff --git a/hw/pc_piix.c b/hw/pc_piix.c index d1fd276..034761f 100644 --- a/hw/pc_piix.c +++ b/hw/pc_piix.c @@ -44,6 +44,7 @@ #include memory.h #include exec-memory.h #include dimm.h +#include fw_cfg.h #ifdef CONFIG_XEN # include xen/hvm/hvm_info_table.h #endif @@ -149,6 +150,7 @@ static void pc_init1(MemoryRegion *system_memory, MemoryRegion *pci_memory; MemoryRegion *rom_memory; void *fw_cfg = NULL; +uint64_t *pci_window_fw_cfg; pc_cpus_init(cpu_model); @@ -205,6 +207,14 @@ static void pc_init1(MemoryRegion *system_memory, ? 
0 : ((uint64_t)1 62)), pci_memory, ram_memory); + +pci_window_fw_cfg = g_malloc0(2 * 8); +pci_window_fw_cfg[0] = cpu_to_le64(below_4g_mem_size + +below_4g_hp_mem_size); +pci_window_fw_cfg[1] = cpu_to_le64(0x1ULL + above_4g_mem_size ++ above_4g_hp_mem_size); +fw_cfg_add_bytes(fw_cfg, FW_CFG_PCI_WINDOW, +(uint8_t *)pci_window_fw_cfg, 2 * 8); } else { pci_bus = NULL; i440fx_state = NULL; -- 1.7.9 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3 20/19][SeaBIOS] alternative: Use paravirt interface for pci windows
Initialize the 32-bit and 64-bit pci starting offsets from values passed in by the qemu paravirt interface QEMU_CFG_PCI_WINDOW. Qemu calculates the starting offsets based on initial memory and hotplug-able dimms. Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- src/paravirt.c |6 ++ src/paravirt.h |2 ++ src/pciinit.c |5 ++--- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/paravirt.c b/src/paravirt.c index 2a98d53..390ef30 100644 --- a/src/paravirt.c +++ b/src/paravirt.c @@ -346,3 +346,9 @@ void qemu_cfg_romfile_setup(void) dprintf(3, Found fw_cfg file: %s (size=%d)\n, file-name, file-size); } } + +void qemu_cfg_get_pci_offsets(u64 *pcimem_start, u64 *pcimem64_start) +{ +qemu_cfg_read_entry(pcimem_start, QEMU_CFG_PCI_WINDOW, sizeof(u64)); +qemu_cfg_read((u8*)(pcimem64_start), sizeof(u64)); +} diff --git a/src/paravirt.h b/src/paravirt.h index a284c41..b53ff88 100644 --- a/src/paravirt.h +++ b/src/paravirt.h @@ -35,6 +35,7 @@ static inline int kvm_para_available(void) #define QEMU_CFG_BOOT_MENU 0x0e #define QEMU_CFG_MAX_CPUS 0x0f #define QEMU_CFG_FILE_DIR 0x19 +#define QEMU_CFG_PCI_WINDOW 0x1a #define QEMU_CFG_ARCH_LOCAL 0x8000 #define QEMU_CFG_ACPI_TABLES(QEMU_CFG_ARCH_LOCAL + 0) #define QEMU_CFG_SMBIOS_ENTRIES (QEMU_CFG_ARCH_LOCAL + 1) @@ -65,5 +66,6 @@ struct e820_reservation { u32 qemu_cfg_e820_entries(void); void* qemu_cfg_e820_load_next(void *addr); void qemu_cfg_romfile_setup(void); +void qemu_cfg_get_pci_offsets(u64 *pcimem_start, u64 *pcimem64_start); #endif diff --git a/src/pciinit.c b/src/pciinit.c index 68f302a..64468a0 100644 --- a/src/pciinit.c +++ b/src/pciinit.c @@ -592,8 +592,7 @@ static void pci_region_map_entries(struct pci_bus *busses, struct pci_region *r) static void pci_bios_map_devices(struct pci_bus *busses) { -pcimem_start = RamSize; - +qemu_cfg_get_pci_offsets(pcimem_start, pcimem64_start); if (pci_bios_init_root_regions(busses)) { struct pci_region r64_mem, r64_pref; r64_mem.list = NULL; @@ 
-611,7 +610,7 @@ static void pci_bios_map_devices(struct pci_bus *busses) u64 align_mem = pci_region_align(r64_mem); u64 align_pref = pci_region_align(r64_pref); -r64_mem.base = ALIGN(0x1LL + RamSizeOver4G, align_mem); +r64_mem.base = ALIGN(pcimem64_start, align_mem); r64_pref.base = ALIGN(r64_mem.base + sum_mem, align_pref); pcimem64_start = r64_mem.base; pcimem64_end = r64_pref.base + sum_pref; -- 1.7.9 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH RFC 0/2] kvm: Improving undercommit, overcommit scenarios in PLE handler
In some special scenarios like #vcpu = #pcpu, PLE handler may prove very costly, because there is no need to iterate over vcpus and do unsuccessful yield_to burning CPU. An idea to solve this is: 1) As Avi had proposed we can modify hardware ple_window dynamically to avoid frequent PL-exit. (IMHO, it is difficult to decide when we have mixed type of VMs). Another idea, proposed in the first patch, is to identify non-overcommit case and just return from the PLE handler. There are are many ways to identify non-overcommit scenario. 1) Using loadavg etc (get_avenrun/calc_global_load /this_cpu_load) 2) Explicitly check nr_running()/num_online_cpus() 3) Check source vcpu runqueue length. Not sure how can we make use of (1) effectively/how to use it. (2) has significant overhead since it iterates all cpus. so this patch uses third method. (I feel it is uglier to export runqueue length, but expecting suggestion on this). In second patch, when we have large number of small guests, it is possible that a spinning vcpu fails to yield_to any vcpu of same VM and go back and spin. This is also not effective when we are over-committed. Instead, we do a schedule() so that we give chance to other VMs to run. Raghavendra K T(2): Handle undercommitted guest case in PLE handler Be courteous to other VMs in overcommitted scenario in PLE handler Results: base = 3.6.0-rc5 + ple handler optimization patches from kvm tree. patched = base + patch1 + patch2 machine: x240 with 16 core with HT enabled (32 cpu thread). 32 vcpu guest with 8GB RAM. 
+---+---+---++---+ ebizzy (record/sec higher is better) +---+---+---++---+ basestddev patchedstdev%improve +---+---+---++---+ 11293.3750 624.4378 18209.6250 371.706161.24166 3641.8750 468.94003725.5000 253.7823 2.29621 +---+---+---++---+ +---+---+---++---+ kernbench (time in sec lower is better) +---+---+---++---+ basestddev patchedstdev%improve +---+---+---++---+ 30.6020 1.3018 30.8287 1.1517-0.74080 64.0825 2.3764 63.4721 5.0191 0.95252 95.8638 8.7030 94.5988 8.3832 1.31958 +---+---+---++---+ Note: on mx3850x5 machine with 32 cores HT disabled I got around ebizzy 209% kernbench 6% improvement for 1x scenario. Thanks Srikar for his active partipation in discussing ideas and reviewing the patch. Please let me know your suggestions and comments. --- include/linux/sched.h |1 + kernel/sched/core.c |6 ++ virt/kvm/kvm_main.c |7 +++ 3 files changed, 14 insertions(+), 0 deletions(-) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH RFC 1/2] kvm: Handle undercommitted guest case in PLE handler
From: Raghavendra K T raghavendra...@linux.vnet.ibm.com When total number of VCPUs of system is less than or equal to physical CPUs, PLE exits become costly since each VCPU can have dedicated PCPU, and trying to find a target VCPU to yield_to just burns time in PLE handler. This patch reduces overhead, by simply doing a return in such scenarios by checking the length of current cpu runqueue. Reviewed-by: Srikar Dronamraju sri...@linux.vnet.ibm.com Signed-off-by: Raghavendra K T raghavendra...@linux.vnet.ibm.com --- include/linux/sched.h |1 + kernel/sched/core.c |6 ++ virt/kvm/kvm_main.c |3 +++ 3 files changed, 10 insertions(+), 0 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index b8c8664..3645458 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -138,6 +138,7 @@ extern int nr_threads; DECLARE_PER_CPU(unsigned long, process_counts); extern int nr_processes(void); extern unsigned long nr_running(void); +extern unsigned long rq_nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_iowait(void); extern unsigned long nr_iowait_cpu(int cpu); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fbf1fd0..2170b81 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4820,6 +4820,12 @@ void __sched yield(void) } EXPORT_SYMBOL(yield); +unsigned long rq_nr_running(void) +{ + return this_rq()-nr_running; +} +EXPORT_SYMBOL(rq_nr_running); + /** * yield_to - yield the current processor to another thread in * your thread group, or accelerate that thread toward the diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 28f00bc..8323685 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1629,6 +1629,9 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) int pass; int i; + if (unlikely(rq_nr_running() == 1)) + return; + kvm_vcpu_set_in_spin_loop(me, true); /* * We boost the priority of a VCPU that is runnable but not -- To unsubscribe from this list: send the line unsubscribe kvm 
in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH RFC 2/2] kvm: Be courteous to other VMs in overcommitted scenario in PLE handler
From: Raghavendra K T raghavendra...@linux.vnet.ibm.com When PLE handler fails to find a better candidate to yield_to, it goes back and does spin again. This is acceptable when we do not have overcommit. But in overcommitted scenarios (especially when we have large number of small guests), it is better to yield. Reviewed-by: Srikar Dronamraju sri...@linux.vnet.ibm.com Signed-off-by: Raghavendra K T raghavendra...@linux.vnet.ibm.com --- virt/kvm/kvm_main.c |4 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8323685..713b677 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1660,6 +1660,10 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) } } } + /* In overcommitted cases, yield instead of spinning */ + if (!yielded rq_nr_running() 1) + schedule(); + kvm_vcpu_set_in_spin_loop(me, false); /* Ensure vcpu is not eligible during next spinloop */ -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [KVM] Guest Debugging Facilities in KVM
On 2012-09-20 19:17, Dean Pucsek wrote: On 2012-09-19, at 7:45 AM, Jan Kiszka jan.kis...@siemens.com wrote: On 2012-09-19 16:38, Avi Kivity wrote: On 09/17/2012 10:36 PM, Dean Pucsek wrote: Hello, For my Masters thesis I am investigating the usage of Intel VT-x and branch tracing in the domain of malware analysis. Essentially what I'm aiming to do is trace the execution of a guest VM and then pass that trace on to some other tools. I've been playing KVM for a couple weeks now but from comments such as (in arch/x86/kvm/vmx.c): /* * Forward all other exceptions that are valid in real mode. * FIXME: Breaks guest debugging in real mode, needs to be fixed with *the required debugging infrastructure rework. */ And (from an email sent to the list in July 2008): Note that guest debugging in real mode is broken now. This has to be fixed by the scheduled debugging infrastructure rework (will be done once base patches for QEMU have been accepted). it is unclear to me how much support there is for guest debugging in KVM currently (I wasn't able to find any recent documentation on it) and what the debugging infrastructure referred to by these comments is. I am interested in becoming involved with the KVM project in this respect however some guidance and direction on the guest debugging facilities would be greatly appreciated. Guest debugging works (but not in real mode due to the issue above). That doesn't apply to CPUs with Unrestricted Guest support, right? At least I didn't notice any limitations recently. [I did notice some other corner-case issue with guest debugging, still need to dig into that...] You can set hardware and software breakpoints and kvm will forward them to userspace, and from there to the debugger. I'll be happy to help, as I'm sure Jan (as the author of most of the guest debugging code) will as well. Is there a roadmap or plan for how the KVM project envisions the debugging facilities evolving? 
KVM and QEMU are in a pretty good shape now for kernel debugging on x86 - given current boundary conditions. Still we need to do something because gdb for x86 is not well prepared for system-level debugging. And those changes will requires some extensions of QEMU in turn. There are some ideas and early code to add gdb tracepoint support to QEMU and, possibly, to KVM (as acceleration). Jan -- Siemens AG, Corporate Technology, CT RTC ITP SDP-DE Corporate Competence Center Embedded Linux -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 02/17] target-i386: Add missing kvm bits.
On Fri, Sep 21, 2012 at 10:39:52AM +0200, Igor Mammedov wrote: On Thu, 20 Sep 2012 16:03:17 -0400 Don Slutz d...@cloudswitch.com wrote: Fix duplicate name (kvmclock = kvm_clock2) also. Signed-off-by: Don Slutz d...@cloudswitch.com --- target-i386/cpu.c | 12 1 files changed, 8 insertions(+), 4 deletions(-) diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 0313cf5..5f9866a 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -87,10 +87,14 @@ static const char *ext3_feature_name[] = { }; static const char *kvm_feature_name[] = { -kvmclock, kvm_nopiodelay, kvm_mmu, kvmclock, kvm_asyncpf, NULL, kvm_pv_eoi, NULL, -NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, -NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, -NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +kvmclock, kvm_nopiodelay, kvm_mmu, kvm_clock2, before patch if kvmclock is specified it would set 0 and 3 bits, after patch only bit 0 is set. Is it correct/expected behavior? if yes, please add rationale into patch description. The problem here seems to be: - It would be interesting to make kvmclock=true enough to enable the optimal behavior, instead of requiring users to use kvm_clock2=true explicitly - We need to allow older machine-types to be backwards compatible (not enabling the second bit by default), so we need a separate property to control the second bit. I think this is best modelled this way: - Having two separate properties: kvmclock and kvmclock2 (or kvm_clock2) - Older machine-types would have kvmclock2 default to false. Newer machine-types would kvmclock2 default to true. 
- kvmclock=false would disable both bits Then: - kvmclock=false would not set any bit (it would be surprising to have kvmclock=false but still have kvmclock enabled) - kvmclock=true would keep compatible behavior on older machine-types, (only the first bit set), but would get optimal behavior on newer machine-types (both bits set) - kvmclock=true,kvmclock2=true would set both bits - kvmclock=true,kvmclock2=false would set only the first bit It wouldn't be a direct mapping between properties and CPUID bits, but that's exactly the point. In this case, exposing individual CPUID bits directly is a too low-level interface. +kvm_asyncpf, kvm_steal_time, kvm_pv_eoi, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +kvm_clock_stable, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, }; static const char *svm_feature_name[] = { -- 1.7.1 -- Regards, Igor -- Eduardo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC 1/2] kvm: Handle undercommitted guest case in PLE handler
On 09/21/2012 08:00 AM, Raghavendra K T wrote: From: Raghavendra K T raghavendra...@linux.vnet.ibm.com When total number of VCPUs of system is less than or equal to physical CPUs, PLE exits become costly since each VCPU can have dedicated PCPU, and trying to find a target VCPU to yield_to just burns time in PLE handler. This patch reduces overhead, by simply doing a return in such scenarios by checking the length of current cpu runqueue. I am not convinced this is the way to go. The VCPU that is holding the lock, and is not releasing it, probably got scheduled out. That implies that VCPU is on a runqueue with at least one other task. --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1629,6 +1629,9 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) int pass; int i; + if (unlikely(rq_nr_running() == 1)) + return; + kvm_vcpu_set_in_spin_loop(me, true); /* * We boost the priority of a VCPU that is runnable but not -- All rights reversed -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 02/17] target-i386: Add missing kvm bits.
On 09/21/12 08:36, Eduardo Habkost wrote: On Fri, Sep 21, 2012 at 10:39:52AM +0200, Igor Mammedov wrote: On Thu, 20 Sep 2012 16:03:17 -0400 Don Slutz d...@cloudswitch.com wrote: Fix duplicate name (kvmclock = kvm_clock2) also. Signed-off-by: Don Slutz d...@cloudswitch.com --- target-i386/cpu.c | 12 1 files changed, 8 insertions(+), 4 deletions(-) diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 0313cf5..5f9866a 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -87,10 +87,14 @@ static const char *ext3_feature_name[] = { }; static const char *kvm_feature_name[] = { -kvmclock, kvm_nopiodelay, kvm_mmu, kvmclock, kvm_asyncpf, NULL, kvm_pv_eoi, NULL, -NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, -NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, -NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +kvmclock, kvm_nopiodelay, kvm_mmu, kvm_clock2, before patch if kvmclock is specified it would set 0 and 3 bits, after patch only bit 0 is set. Is it correct/expected behavior? if yes, please add rationale into patch description. This is not what I had intended. The problem here seems to be: - It would be interesting to make kvmclock=true enough to enable the optimal behavior, instead of requiring users to use kvm_clock2=true explicitly - We need to allow older machine-types to be backwards compatible (not enabling the second bit by default), so we need a separate property to control the second bit. I think this is best modelled this way: - Having two separate properties: kvmclock and kvmclock2 (or kvm_clock2) - Older machine-types would have kvmclock2 default to false. Newer machine-types would kvmclock2 default to true. 
- kvmclock=false would disable both bits Then: - kvmclock=false would not set any bit (it would be surprising to have kvmclock=false but still have kvmclock enabled) - kvmclock=true would keep compatible behavior on older machine-types, (only the first bit set), but would get optimal behavior on newer machine-types (both bits set) - kvmclock=true,kvmclock2=true would set both bits - kvmclock=true,kvmclock2=false would set only the first bit It wouldn't be a direct mapping between properties and CPUID bits, but that's exactly the point. In this case, exposing individual CPUID bits directly is a too low-level interface. This does look much better. For the sake of simple changes, this patch will be changed so that -kvmclock (kvmclock=false) will continue to clear both bits. I will look into the right way to fit this into the newer cpu model. +kvm_asyncpf, kvm_steal_time, kvm_pv_eoi, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +kvm_clock_stable, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, }; static const char *svm_feature_name[] = { -- 1.7.1 -- Regards, Igor -Don Slutz -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC 0/2] kvm: Improving undercommit, overcommit scenarios in PLE handler
On 9/21/2012 4:59 AM, Raghavendra K T wrote: In some special scenarios like #vcpu = #pcpu, PLE handler may prove very costly, Yes. because there is no need to iterate over vcpus and do unsuccessful yield_to burning CPU. An idea to solve this is: 1) As Avi had proposed we can modify hardware ple_window dynamically to avoid frequent PL-exit. Yes. We had to do this to get around some scaling issues for large (20way) guests (with no overcommitment) As part of some experimentation we even tried switching off PLE too :( (IMHO, it is difficult to decide when we have mixed type of VMs). Agree. Not sure if the following alternatives have also been looked at : - Could the behavior associated with the ple_window be modified to be a function of some [new] per-guest attribute (which can be conveyed to the host as part of the guest launch sequence). The user can choose to set this [new] attribute for a given guest. This would help avoid the frequent exits due to PLE (as Avi had mentioned earlier) ? - Can the PLE feature ( in VT) be enhanced to be made a per guest attribute ? IMHO, the approach of not taking a frequent exit is better than taking an exit and returning back from the handler etc. Thanks Vinod Another idea, proposed in the first patch, is to identify non-overcommit case and just return from the PLE handler. There are are many ways to identify non-overcommit scenario. 1) Using loadavg etc (get_avenrun/calc_global_load /this_cpu_load) 2) Explicitly check nr_running()/num_online_cpus() 3) Check source vcpu runqueue length. Not sure how can we make use of (1) effectively/how to use it. (2) has significant overhead since it iterates all cpus. so this patch uses third method. (I feel it is uglier to export runqueue length, but expecting suggestion on this). In second patch, when we have large number of small guests, it is possible that a spinning vcpu fails to yield_to any vcpu of same VM and go back and spin. This is also not effective when we are over-committed. 
Instead, we do a schedule() so that we give chance to other VMs to run. Raghavendra K T(2): Handle undercommitted guest case in PLE handler Be courteous to other VMs in overcommitted scenario in PLE handler Results: base = 3.6.0-rc5 + ple handler optimization patches from kvm tree. patched = base + patch1 + patch2 machine: x240 with 16 core with HT enabled (32 cpu thread). 32 vcpu guest with 8GB RAM. +---+---+---++---+ ebizzy (record/sec higher is better) +---+---+---++---+ basestddev patchedstdev%improve +---+---+---++---+ 11293.3750 624.4378 18209.6250 371.706161.24166 3641.8750 468.9400 3725.5000 253.7823 2.29621 +---+---+---++---+ +---+---+---++---+ kernbench (time in sec lower is better) +---+---+---++---+ basestddev patchedstdev%improve +---+---+---++---+ 30.6020 1.3018 30.8287 1.1517-0.74080 64.0825 2.3764 63.4721 5.0191 0.95252 95.8638 8.7030 94.5988 8.3832 1.31958 +---+---+---++---+ Note: on mx3850x5 machine with 32 cores HT disabled I got around ebizzy 209% kernbench 6% improvement for 1x scenario. Thanks Srikar for his active partipation in discussing ideas and reviewing the patch. Please let me know your suggestions and comments. --- include/linux/sched.h |1 + kernel/sched/core.c |6 ++ virt/kvm/kvm_main.c |7 +++ 3 files changed, 14 insertions(+), 0 deletions(-) . -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC 2/2] kvm: Be courteous to other VMs in overcommitted scenario in PLE handler
On 09/21/2012 08:00 AM, Raghavendra K T wrote: From: Raghavendra K T raghavendra...@linux.vnet.ibm.com When PLE handler fails to find a better candidate to yield_to, it goes back and does spin again. This is acceptable when we do not have overcommit. But in overcommitted scenarios (especially when we have large number of small guests), it is better to yield. Reviewed-by: Srikar Dronamraju sri...@linux.vnet.ibm.com Signed-off-by: Raghavendra K T raghavendra...@linux.vnet.ibm.com Acked-by: Rik van Riel r...@redhat.com -- All rights reversed -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC 2/2] kvm: Be courteous to other VMs in overcommitted scenario in PLE handler
On Fri, 21 Sep 2012 17:30:20 +0530 Raghavendra K T raghavendra...@linux.vnet.ibm.com wrote: From: Raghavendra K T raghavendra...@linux.vnet.ibm.com When PLE handler fails to find a better candidate to yield_to, it goes back and does spin again. This is acceptable when we do not have overcommit. But in overcommitted scenarios (especially when we have large number of small guests), it is better to yield. Reviewed-by: Srikar Dronamraju sri...@linux.vnet.ibm.com Signed-off-by: Raghavendra K T raghavendra...@linux.vnet.ibm.com --- virt/kvm/kvm_main.c |4 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8323685..713b677 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1660,6 +1660,10 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) } } } + /* In overcommitted cases, yield instead of spinning */ + if (!yielded rq_nr_running() 1) + schedule(); How about doing cond_resched() instead? I'm not sure whether checking more sched stuff in KVM code is a good thing. Takuya -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/9] Reduce compaction scanning and lock contention
On 09/21/2012 06:46 AM, Mel Gorman wrote: Hi Andrew, Richard Davies and Shaohua Li have both reported lock contention problems in compaction on the zone and LRU locks as well as significant amounts of time being spent in compaction. This series aims to reduce lock contention and scanning rates to reduce that CPU usage. Richard reported at https://lkml.org/lkml/2012/9/21/91 that this series made a big different to a problem he reported in August (http://marc.info/?l=kvmm=134511507015614w=2). One way or the other, this series has a large impact on the amount of scanning compaction does when there is a storm of THP allocations. Andrew, Mel and I have discussed the stuff in this series quite a bit, and I am convinced this is the way forward with compaction. -- All rights reversed -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC 2/2] kvm: Be courteous to other VMs in overcommitted scenario in PLE handler
On 09/21/2012 09:46 AM, Takuya Yoshikawa wrote: On Fri, 21 Sep 2012 17:30:20 +0530 Raghavendra K T raghavendra...@linux.vnet.ibm.com wrote: From: Raghavendra K T raghavendra...@linux.vnet.ibm.com When PLE handler fails to find a better candidate to yield_to, it goes back and does spin again. This is acceptable when we do not have overcommit. But in overcommitted scenarios (especially when we have large number of small guests), it is better to yield. Reviewed-by: Srikar Dronamraju sri...@linux.vnet.ibm.com Signed-off-by: Raghavendra K T raghavendra...@linux.vnet.ibm.com --- virt/kvm/kvm_main.c |4 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8323685..713b677 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1660,6 +1660,10 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) } } } + /* In overcommitted cases, yield instead of spinning */ + if (!yielded rq_nr_running() 1) + schedule(); How about doing cond_resched() instead? Actually, an actual call to yield() may be better. That will set scheduler hints to make the scheduler pick another task for one round, while preserving this task's top position in the runqueue. -- All rights reversed -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 1/1] target-i386: Fix default Hypervisor level for hypervisor-vendor=kvm.
On Thu, Sep 20, 2012 at 04:06:27PM -0400, Don Slutz wrote: From http://lkml.indiana.edu/hypermail/linux/kernel/1205.0/00100.html EAX should be KVM_CPUID_FEATURES (0x4001) not 0. Added hypervisor-vendor=kvm0 to get the older CPUID result. kvm1 selects the newer one. Why not just make hypervisor-vendor=kvm control only the hypervisor vendor string, and support something like kvm-hypervisor-level=0 to restore the old cpuid_hv_level=0 behavior? This is similar to the kvmclock case: it would allow us to make hypervisor-vendor=kvm use saner values as default, but letting old machine-types to override it for compatibility if required. Signed-off-by: Don Slutz d...@cloudswitch.com --- target-i386/cpu.c | 17 + 1 files changed, 13 insertions(+), 4 deletions(-) diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 72a8442..e51b2b1 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -1250,9 +1250,12 @@ static char *x86_cpuid_get_hv_vendor(Object *obj, Error **errp) } else if (!strcmp(value, CPUID_HV_VENDOR_XEN) env-cpuid_hv_level == CPUID_HV_LEVEL_XEN) { pstrcpy(value, sizeof(value), xen); -} else if (!strcmp(value, CPUID_HV_VENDOR_KVM) - env-cpuid_hv_level == 0) { -pstrcpy(value, sizeof(value), kvm); +} else if (!strcmp(value, CPUID_HV_VENDOR_KVM)) { +if (env-cpuid_hv_level == CPUID_HV_LEVEL_KVM) { +pstrcpy(value, sizeof(value), kvm1); +} else if (env-cpuid_hv_level == 0) { +pstrcpy(value, sizeof(value), kvm0); +} } return value; } @@ -1288,7 +1291,13 @@ static void x86_cpuid_set_hv_vendor(Object *obj, const char *value, env-cpuid_hv_level = CPUID_HV_LEVEL_XEN; } pstrcpy(adj_value, sizeof(adj_value), CPUID_HV_VENDOR_XEN); -} else if (!strcmp(value, kvm)) { +} else if (!strcmp(value, kvm) || !strcmp(value, kvm1)) { +if (env-cpuid_hv_level == 0) { +env-cpuid_hv_level = CPUID_HV_LEVEL_KVM; +} +pstrcpy(adj_value, sizeof(adj_value), CPUID_HV_VENDOR_KVM); +} else if (!strcmp(value, kvm0)) { +env-cpuid_hv_level = 0; pstrcpy(adj_value, sizeof(adj_value), 
CPUID_HV_VENDOR_KVM); } else { pstrcpy(adj_value, sizeof(adj_value), value); -- 1.7.1 -- Eduardo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: pci-assign terminates the guest upon pread() / pwrite() error?
On 09/20/2012 05:13 PM, Alex Williamson wrote: On Thu, 2012-09-20 at 16:36 -0400, Etienne Martineau wrote: On 09/20/2012 03:37 PM, Alex Williamson wrote: On Thu, 2012-09-20 at 15:08 -0400, Etienne Martineau wrote: On 09/20/2012 02:16 PM, Alex Williamson wrote: On Thu, 2012-09-20 at 13:27 -0400, Etienne Martineau wrote: In hw/kvm/pci-assign.c a pread() error part of assigned_dev_pci_read() result in a hw_error(). Similarly a pwrite() error part of assigned_dev_pci_write() also result in a hw_error(). Would there be a way to avoid terminating the guest for those cases? How about we deassign the device upon error? By terminating the guest we contain the error vs allowing the guest to continue running with invalid data. De-assigning the device is asynchronous and relies on guest involvement, so damage is potentially already done. Is this a theoretical problem or do you actually have hardware that hits this? Thanks, Alex This problem is in the context of a Hot-pluggable device assigned to the guest. If the guest rd/wr the config space at the same time than the device is physically taken out then the guest will terminate with hw_error(). Because this limits the availability of the guest I think we should try to recover instead. I don't see what other damage can happen since guest's MMIO access to the stale device will go nowhere? So you're looking at implementing surprise device removal? There's not just config space, there's slow bar access and mmap'd spaces to worry about too. What does going nowhere mean? If it means reads return -1 and the guest is trying to read the data portion of a packet from the network or an hba, we've now passed bad data to the guest. Thanks, Alex Thanks for your answer; Yes we are doing 'surprise device removal' for assigned device. Note that the problem also exist with standard 'attention button' device removal. The problem is all about fault isolation. 
Ideally, only the corresponding driver should be affected by this 'surprise device removal'. I think that taking down the guest is too coarse. Think about a 'surprise device removal' on the host. In that case the host is not taken down so why not do the same with the guest? It depends on the host hardware. Some x86 hardware will try to isolate the fault with an NMI other architectures such as ia64 would pull a machine check on a driver access to unresponsive devices. Yes some badness will be latched into the guest but really this not any different that having a mis-behaving device. ... which is a bad thing, but often undetectable. This is detectable. Thanks, Alex Our hardware is throwing a surprise link down PCIe AER and we are acting on it. I agree that for the generalized case NMI can be an issue. Let me ask you that question. What would be the best way to support device removal (surprise or not) for guest assigned device then? How about signaling the guest from vfio_pci_remove()? thanks, Etienne -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: pci-assign terminates the guest upon pread() / pwrite() error?
On Fri, 2012-09-21 at 11:17 -0400, Etienne Martineau wrote: On 09/20/2012 05:13 PM, Alex Williamson wrote: On Thu, 2012-09-20 at 16:36 -0400, Etienne Martineau wrote: On 09/20/2012 03:37 PM, Alex Williamson wrote: On Thu, 2012-09-20 at 15:08 -0400, Etienne Martineau wrote: On 09/20/2012 02:16 PM, Alex Williamson wrote: On Thu, 2012-09-20 at 13:27 -0400, Etienne Martineau wrote: In hw/kvm/pci-assign.c a pread() error part of assigned_dev_pci_read() result in a hw_error(). Similarly a pwrite() error part of assigned_dev_pci_write() also result in a hw_error(). Would there be a way to avoid terminating the guest for those cases? How about we deassign the device upon error? By terminating the guest we contain the error vs allowing the guest to continue running with invalid data. De-assigning the device is asynchronous and relies on guest involvement, so damage is potentially already done. Is this a theoretical problem or do you actually have hardware that hits this? Thanks, Alex This problem is in the context of a Hot-pluggable device assigned to the guest. If the guest rd/wr the config space at the same time than the device is physically taken out then the guest will terminate with hw_error(). Because this limits the availability of the guest I think we should try to recover instead. I don't see what other damage can happen since guest's MMIO access to the stale device will go nowhere? So you're looking at implementing surprise device removal? There's not just config space, there's slow bar access and mmap'd spaces to worry about too. What does going nowhere mean? If it means reads return -1 and the guest is trying to read the data portion of a packet from the network or an hba, we've now passed bad data to the guest. Thanks, Alex Thanks for your answer; Yes we are doing 'surprise device removal' for assigned device. Note that the problem also exist with standard 'attention button' device removal. The problem is all about fault isolation. 
Ideally, only the corresponding driver should be affected by this 'surprise device removal'. I think that taking down the guest is too coarse. Think about a 'surprise device removal' on the host. In that case the host is not taken down so why not do the same with the guest? It depends on the host hardware. Some x86 hardware will try to isolate the fault with an NMI other architectures such as ia64 would pull a machine check on a driver access to unresponsive devices. Yes some badness will be latched into the guest but really this not any different that having a mis-behaving device. ... which is a bad thing, but often undetectable. This is detectable. Thanks, Alex Our hardware is throwing a surprise link down PCIe AER and we are acting on it. I agree that for the generalized case NMI can be an issue. Let me ask you that question. What would be the best way to support device removal (surprise or not) for guest assigned device then? How about signaling the guest from vfio_pci_remove()? Thanks for using vfio! :) The 440fx chipset is really not designed to deal with these kinds of problems. Generally the best answer to how should we expose foo to the guest is to do it exactly like it is on the host. That means sending a surprise link down aer to the guest. That should be possible with q35. We could potentially signal that in vfio_pci_remove, but we probably want to figure out how to relay the aer event to the guest and inject it into the emulated chipset. Thanks, Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Xen-users] Recommendations for Virtulization Hardware
(CC'ing Casey on this, as I recommend his setup for an Intel-based solution) Hello ShadesOfGrey, Hehehe, talk about timing ;) On Sep 20, 2012, at 5:15 PM, ShadesOfGrey shades_of_g...@earthlink.net wrote: I'm looking to build a new personal computer. I want it to function as a Linux desktop, provide network services for my home, and lastly, occasional Windows gaming. From what I've gathered, virtualization using a Type 1 Hypervisor supporting PCI/VGA pass-through like KVM or Xen would be an attractive solution for my needs. For reference, reading these threads on Ars Technica may be helpful to understand where I'm coming from, http://arstechnica.com/civis/viewtopic.php?f=6t=1175674 and http://arstechnica.com/civis/viewtopic.php?f=11t=1181867. But basically, I use Linux as my primary OS and would rather avoid dual booting or building two boxes just to play Windows games when I want to play Windows games. I'm also intrigued by the concept of virtualization and would like to experiment with it as a solution for my case. My problem is isolating which hardware to choose, specifically which combination of CPU, motherboard and video card. Previously I had been relying on web searches to glean information from gaming and enthusiast web sites and tech specs from motherboard manufacturers. After what I learned during my participation in the referenced threads at Ars Technica, I find myself back at square one. Instead of trying to guess what hardware support KVM Xen, and vice versa. I'd like to know what hardware KVM Xen users are actually using to run KVM Xen? Particularly with consideration for 3D gaming and current generation hardware, BTW. If there is need for further clarification, I'll answer any queries you might have. ___ Xen-users mailing list xen-us...@lists.xen.org http://lists.xen.org/xen-users As one of the folks on the list who has done this---probably to the most extreme degree---I can tell you it's good stuff. 
It brings the joys of datacenter consolidation to your gaming desktop, and also to your wallet ;) While my setup is now slightly dated, the 990FX chipset is still at the top of the AMD offering, so you can shop around on CPUs, and buy a cheaper secondary USB controller if you're not looking to cram in a 4-to-1 ratio. I've never had much success passing through the onboard USB from an AMD system, so I highly recommend picking up a little PCIe x1 controller at the least. That said, I'm convinced that highpoint has one of the coincidentally-best products on the market for people looking to do this, but I digress! Take a look at, specifically, this post I made to the list some months back, and I'll follow with some errata: http://lists.xen.org/archives/html/xen-users/2012-05/msg00328.html First, I've tested all of the hardware in the build that I recommended, and indeed ended up building a four-headed unit. It works like magic. Came in handy a few weeks ago when several of my friends and I piled into a couple cars for a vacation where we wanted to play games (yup, we're total nerds), but we couldn't fit four desktop cases in addition to our stuff in the cars. :) Second, by the time I got around to building it, the Antec One Hundred wasn't available. Finding a case that supports 8 expansion slots is a tough thing, but I found another similarly priced one, and it was a dream to build. I recommend it highly if you think you may want to max out your slots and/or go deeper down the rabbit hole with consolidated desktops: http://www.newegg.com/Product/Product.aspx?Item=N82E1682238 Aside from being a very solid case for the price point (good features for screwless installation as well), to give you an idea of the size, it is laid out in such a way that I could fit dual-GPU cards in it (Radeon 5970s). I ultimately had to remove the HDD mounts to pull it off, but you shouldn't have that problem... 
Mostly because AMDs dual GPU cards won't work for this, so don't buy one for this build. It's a problem with the PCIe switching hardware (well, the firmware thereof, probably) that they use. I'll save you the rambling, but let's just say that it should work, but doesn't :( Also, the case does look good! ;) Finally, and this is unfortunate, for the AMD build, *I* recommend you use ESXi. While Xen _does_ work with the hardware that I've listed, I've never been able to get the VMs to work properly with the GPLPV drivers, and these are crucial to performance. I really, really want to bring this project back up on Xen though, and will try again now that 4.2 has gone RTM. If you aren't buying anytime soon and would like to hit me up in a few weeks, by all means drop me a line, and I'll let you know if I've gotten around to it. So, for the Intel route! Casey DeLorme has, just this week, posted a fantastic set of detailed videos and documentation on his setup, where he
[PATCH] vfio: Fix virqfd release race
vfio-pci supports a mechanism like KVM's irqfd for unmasking an interrupt through an eventfd. There are two ways to shutdown this interface: 1) close the eventfd, 2) ioctl (such as disabling the interrupt). Both of these do the release through a workqueue, which can result in a segfault if two jobs get queued for the same virqfd. Fix this by protecting the pointer to these virqfds by a spinlock. The vfio pci device will therefore no longer have a reference to it once the release job is queued under lock. On the ioctl side, we still flush the workqueue to ensure that any outstanding releases are completed. Signed-off-by: Alex Williamson alex.william...@redhat.com --- drivers/vfio/pci/vfio_pci_intrs.c | 76 +++-- 1 file changed, 56 insertions(+), 20 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 211a492..d8dedc7 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -76,9 +76,24 @@ static int virqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) schedule_work(&virqfd->inject); } - if (flags & POLLHUP) - /* The eventfd is closing, detach from VFIO */ - virqfd_deactivate(virqfd); + if (flags & POLLHUP) { + unsigned long flags; + spin_lock_irqsave(&virqfd->vdev->irqlock, flags); + + /* +* The eventfd is closing, if the virqfd has not yet been +* queued for release, as determined by testing whether the +* vdev pointer to it is still valid, queue it now. As +* with kvm irqfds, we know we won't race against the virqfd +* going away because we hold wqh->lock to get here. 
+*/ + if (*(virqfd-pvirqfd) == virqfd) { + *(virqfd-pvirqfd) = NULL; + virqfd_deactivate(virqfd); + } + + spin_unlock_irqrestore(virqfd-vdev-irqlock, flags); + } return 0; } @@ -93,7 +108,6 @@ static void virqfd_ptable_queue_proc(struct file *file, static void virqfd_shutdown(struct work_struct *work) { struct virqfd *virqfd = container_of(work, struct virqfd, shutdown); - struct virqfd **pvirqfd = virqfd-pvirqfd; u64 cnt; eventfd_ctx_remove_wait_queue(virqfd-eventfd, virqfd-wait, cnt); @@ -101,7 +115,6 @@ static void virqfd_shutdown(struct work_struct *work) eventfd_ctx_put(virqfd-eventfd); kfree(virqfd); - *pvirqfd = NULL; } static void virqfd_inject(struct work_struct *work) @@ -122,15 +135,11 @@ static int virqfd_enable(struct vfio_pci_device *vdev, int ret = 0; unsigned int events; - if (*pvirqfd) - return -EBUSY; - virqfd = kzalloc(sizeof(*virqfd), GFP_KERNEL); if (!virqfd) return -ENOMEM; virqfd-pvirqfd = pvirqfd; - *pvirqfd = virqfd; virqfd-vdev = vdev; virqfd-handler = handler; virqfd-thread = thread; @@ -154,6 +163,23 @@ static int virqfd_enable(struct vfio_pci_device *vdev, virqfd-eventfd = ctx; /* +* virqfds can be released by closing the eventfd or directly +* through ioctl. These are both done through a workqueue, so +* we update the pointer to the virqfd under lock to avoid +* pushing multiple jobs to release the same virqfd. +*/ + spin_lock_irq(vdev-irqlock); + + if (*pvirqfd) { + spin_unlock_irq(vdev-irqlock); + ret = -EBUSY; + goto fail; + } + *pvirqfd = virqfd; + + spin_unlock_irq(vdev-irqlock); + + /* * Install our own custom wake-up handling so we are notified via * a callback whenever someone signals the underlying eventfd. 
*/ @@ -187,19 +213,29 @@ fail: fput(file); kfree(virqfd); - *pvirqfd = NULL; return ret; } -static void virqfd_disable(struct virqfd *virqfd) +static void virqfd_disable(struct vfio_pci_device *vdev, + struct virqfd **pvirqfd) { - if (!virqfd) - return; + unsigned long flags; + + spin_lock_irqsave(vdev-irqlock, flags); + + if (*pvirqfd) { + virqfd_deactivate(*pvirqfd); + *pvirqfd = NULL; + } - virqfd_deactivate(virqfd); + spin_unlock_irqrestore(vdev-irqlock, flags); - /* Block until we know all outstanding shutdown jobs have completed. */ + /* +* Block until we know all outstanding shutdown jobs have completed. +* Even if we don't queue the job, flush the wq to be sure it's +* been released. +*/ flush_workqueue(vfio_irqfd_cleanup_wq); } @@ -392,8 +428,8 @@ static int vfio_intx_set_signal(struct vfio_pci_device *vdev,
Re: pci-assign terminates the guest upon pread() / pwrite() error?
On 09/21/2012 11:49 AM, Alex Williamson wrote: On Fri, 2012-09-21 at 11:17 -0400, Etienne Martineau wrote: On 09/20/2012 05:13 PM, Alex Williamson wrote: On Thu, 2012-09-20 at 16:36 -0400, Etienne Martineau wrote: On 09/20/2012 03:37 PM, Alex Williamson wrote: On Thu, 2012-09-20 at 15:08 -0400, Etienne Martineau wrote: On 09/20/2012 02:16 PM, Alex Williamson wrote: On Thu, 2012-09-20 at 13:27 -0400, Etienne Martineau wrote: In hw/kvm/pci-assign.c a pread() error part of assigned_dev_pci_read() result in a hw_error(). Similarly a pwrite() error part of assigned_dev_pci_write() also result in a hw_error(). Would there be a way to avoid terminating the guest for those cases? How about we deassign the device upon error? By terminating the guest we contain the error vs allowing the guest to continue running with invalid data. De-assigning the device is asynchronous and relies on guest involvement, so damage is potentially already done. Is this a theoretical problem or do you actually have hardware that hits this? Thanks, Alex This problem is in the context of a Hot-pluggable device assigned to the guest. If the guest rd/wr the config space at the same time than the device is physically taken out then the guest will terminate with hw_error(). Because this limits the availability of the guest I think we should try to recover instead. I don't see what other damage can happen since guest's MMIO access to the stale device will go nowhere? So you're looking at implementing surprise device removal? There's not just config space, there's slow bar access and mmap'd spaces to worry about too. What does going nowhere mean? If it means reads return -1 and the guest is trying to read the data portion of a packet from the network or an hba, we've now passed bad data to the guest. Thanks, Alex Thanks for your answer; Yes we are doing 'surprise device removal' for assigned device. Note that the problem also exist with standard 'attention button' device removal. 
The problem is all about fault isolation. Ideally, only the corresponding driver should be affected by this 'surprise device removal'. I think that taking down the guest is too coarse. Think about a 'surprise device removal' on the host. In that case the host is not taken down so why not do the same with the guest? It depends on the host hardware. Some x86 hardware will try to isolate the fault with an NMI other architectures such as ia64 would pull a machine check on a driver access to unresponsive devices. Yes some badness will be latched into the guest but really this not any different that having a mis-behaving device. ... which is a bad thing, but often undetectable. This is detectable. Thanks, Alex Our hardware is throwing a surprise link down PCIe AER and we are acting on it. I agree that for the generalized case NMI can be an issue. Let me ask you that question. What would be the best way to support device removal (surprise or not) for guest assigned device then? How about signaling the guest from vfio_pci_remove()? Thanks for using vfio! :) The 440fx chipset is really not designed to deal with these kinds of problems. Generally the best answer to how should we expose foo to the guest is to do it exactly like it is on the host. That means sending a surprise link down aer to the guest. That should be possible with q35. We are using q35 at this time for those reasons but the original qemu problem still exist. By the time the SPLD aer reached the guest, the device is physically gone on the host. Any transient guest MMIO/PCIcfg access to the stale assigned device can be fatal ( hw_error() ). We could potentially signal that in vfio_pci_remove, but we probably want to figure out how to relay the aer event to the guest and inject it into the emulated chipset. We tried that but there was some problems such as mangling the tlp to match the guest pci topology or the propagation latency caused by the chipset emulation layer during AER delivery. 
Right now we are using a straight lookup in the guest and fire the AER directly into the driver callback pci_error. We are doing that to minimize the exposition to the stale assigned device. thanks, Etienne -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC 1/2] kvm: Handle undercommitted guest case in PLE handler
On 09/21/2012 06:32 PM, Rik van Riel wrote: On 09/21/2012 08:00 AM, Raghavendra K T wrote: From: Raghavendra K T raghavendra...@linux.vnet.ibm.com When total number of VCPUs of system is less than or equal to physical CPUs, PLE exits become costly since each VCPU can have dedicated PCPU, and trying to find a target VCPU to yield_to just burns time in PLE handler. This patch reduces overhead, by simply doing a return in such scenarios by checking the length of current cpu runqueue. I am not convinced this is the way to go. The VCPU that is holding the lock, and is not releasing it, probably got scheduled out. That implies that VCPU is on a runqueue with at least one other task. I see your point here, we have two cases: case 1) rq1 : vcpu1-wait(lockA) (spinning) rq2 : vcpu2-holding(lockA) (running) Here Ideally vcpu1 should not enter PLE handler, since it would surely get the lock within ple_window cycle. (assuming ple_window is tuned for that workload perfectly). May be this explains why we are not seeing benefit with kernbench. On the other side, Since we cannot have a perfect ple_window tuned for all type of workloads, for those workloads, which may need more than 4096 cycles, we gain. thinking is it that we are seeing in benefited cases? case 2) rq1 : vcpu1-wait(lockA) (spinning) rq2 : vcpu3 (running) , vcpu2-holding(lockA) [scheduled out] I agree that checking rq1 length is not proper in this case, and as you rightly pointed out, we are in trouble here. nr_running()/num_online_cpus() would give more accurate picture here, but it seemed costly. May be load balancer save us a bit here in not running to such sort of cases. ( I agree load balancer is far too complex). -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/9] Revert mm: compaction: check lock contention first before taking lock
On Fri, Sep 21, 2012 at 11:46:15AM +0100, Mel Gorman wrote: This reverts mm-compaction-check-lock-contention-first-before-taking-lock.patch as it is replaced by a later patch in the series. Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rafael Aquini aqu...@redhat.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/9] Revert mm-compaction-abort-compaction-loop-if-lock-is-contended-or-run-too-long-fix
On Fri, Sep 21, 2012 at 11:46:16AM +0100, Mel Gorman wrote: This reverts mm-compaction-abort-compaction-loop-if-lock-is-contended-or-run-too-long-fix as it is replaced by a later patch in the series. Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rafael Aquini aqu...@redhat.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/9] Revert mm: compaction: abort compaction loop if lock is contended or run too long
On Fri, Sep 21, 2012 at 11:46:17AM +0100, Mel Gorman wrote: This reverts mm-compaction-abort-compaction-loop-if-lock-is-contended-or-run-too-long.patch as it is replaced by a later patch in the series. Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rafael Aquini aqu...@redhat.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC 2/2] kvm: Be courteous to other VMs in overcommitted scenario in PLE handler
On 09/21/2012 07:22 PM, Rik van Riel wrote: On 09/21/2012 09:46 AM, Takuya Yoshikawa wrote: On Fri, 21 Sep 2012 17:30:20 +0530 Raghavendra K T raghavendra...@linux.vnet.ibm.com wrote: From: Raghavendra K T raghavendra...@linux.vnet.ibm.com When PLE handler fails to find a better candidate to yield_to, it goes back and does spin again. This is acceptable when we do not have overcommit. But in overcommitted scenarios (especially when we have large number of small guests), it is better to yield. Reviewed-by: Srikar Dronamraju sri...@linux.vnet.ibm.com Signed-off-by: Raghavendra K T raghavendra...@linux.vnet.ibm.com --- virt/kvm/kvm_main.c | 4 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8323685..713b677 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1660,6 +1660,10 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) } } } + /* In overcommitted cases, yield instead of spinning */ + if (!yielded rq_nr_running() 1) + schedule(); How about doing cond_resched() instead? Actually, an actual call to yield() may be better. That will set scheduler hints to make the scheduler pick another task for one round, while preserving this task's top position in the runqueue. I am not a scheduler expert, but I am also inclined towards Rik's suggestion here since we set skip buddy here. Takuya? -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 4/9] mm: compaction: Abort compaction loop if lock is contended or run too long
On Fri, Sep 21, 2012 at 11:46:18AM +0100, Mel Gorman wrote: From: Shaohua Li s...@fusionio.com Changelog since V2 o Fix BUG_ON triggered due to pages left on cc.migratepages o Make compact_zone_order() require non-NULL arg `contended' Changelog since V1 o only abort the compaction if lock is contended or run too long o Rearranged the code by Andrea Arcangeli. isolate_migratepages_range() might isolate no pages if for example when zone-lru_lock is contended and running asynchronous compaction. In this case, we should abort compaction, otherwise, compact_zone will run a useless loop and make zone-lru_lock is even contended. [minc...@kernel.org: Putback pages isolated for migration if aborting] [a...@linux-foundation.org: compact_zone_order requires non-NULL arg contended] Signed-off-by: Andrea Arcangeli aarca...@redhat.com Signed-off-by: Shaohua Li s...@fusionio.com Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rik van Riel r...@redhat.com --- Acked-by: Rafael Aquini aqu...@redhat.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/9] mm: compaction: Acquire the zone-lru_lock as late as possible
On Fri, Sep 21, 2012 at 11:46:19AM +0100, Mel Gorman wrote: Compactions migrate scanner acquires the zone-lru_lock when scanning a range of pages looking for LRU pages to acquire. It does this even if there are no LRU pages in the range. If multiple processes are compacting then this can cause severe locking contention. To make matters worse commit b2eef8c0 (mm: compaction: minimise the time IRQs are disabled while isolating pages for migration) releases the lru_lock every SWAP_CLUSTER_MAX pages that are scanned. This patch makes two changes to how the migrate scanner acquires the LRU lock. First, it only releases the LRU lock every SWAP_CLUSTER_MAX pages if the lock is contended. This reduces the number of times it unnecessarily disables and re-enables IRQs. The second is that it defers acquiring the LRU lock for as long as possible. If there are no LRU pages or the only LRU pages are transhuge then the LRU lock will not be acquired at all which reduces contention on zone-lru_lock. Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rik van Riel r...@redhat.com --- Acked-by: Rafael Aquini aqu...@redhat.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 6/9] mm: compaction: Acquire the zone-lock as late as possible
On Fri, Sep 21, 2012 at 11:46:20AM +0100, Mel Gorman wrote: Compactions free scanner acquires the zone-lock when checking for PageBuddy pages and isolating them. It does this even if there are no PageBuddy pages in the range. This patch defers acquiring the zone lock for as long as possible. In the event there are no free pages in the pageblock then the lock will not be acquired at all which reduces contention on zone-lock. Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rik van Riel r...@redhat.com --- Acked-by: Rafael Aquini aqu...@redhat.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 7/9] Revert mm: have order 0 compaction start off where it left
On Fri, Sep 21, 2012 at 11:46:21AM +0100, Mel Gorman wrote: This reverts commit 7db8889a (mm: have order 0 compaction start off where it left) and commit de74f1cc (mm: have order 0 compaction start near a pageblock with free pages). These patches were a good idea and tests confirmed that they massively reduced the amount of scanning but the implementation is complex and tricky to understand. A later patch will cache what pageblocks should be skipped and reimplements the concept of compact_cached_free_pfn on top for both migration and free scanners. Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rik van Riel r...@redhat.com --- Acked-by: Rafael Aquini aqu...@redhat.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 8/9] mm: compaction: Cache if a pageblock was scanned and no pages were isolated
On Fri, Sep 21, 2012 at 11:46:22AM +0100, Mel Gorman wrote: When compaction was implemented it was known that scanning could potentially be excessive. The ideal was that a counter be maintained for each pageblock but maintaining this information would incur a severe penalty due to a shared writable cache line. It has reached the point where the scanning costs are a serious problem, particularly on long-lived systems where a large process starts and allocates a large number of THPs at the same time. Instead of using a shared counter, this patch adds another bit to the pageblock flags called PG_migrate_skip. If a pageblock is scanned by either migrate or free scanner and 0 pages were isolated, the pageblock is marked to be skipped in the future. When scanning, this bit is checked before any scanning takes place and the block skipped if set. The main difficulty with a patch like this is when to ignore the cached information? If it's ignored too often, the scanning rates will still be excessive. If the information is too stale then allocations will fail that might have otherwise succeeded. In this patch o CMA always ignores the information o If the migrate and free scanner meet then the cached information will be discarded if it's at least 5 seconds since the last time the cache was discarded o If there are a large number of allocation failures, discard the cache. The time-based heuristic is very clumsy but there are few choices for a better event. Depending solely on multiple allocation failures still allows excessive scanning when THP allocations are failing in quick succession due to memory pressure. Waiting until memory pressure is relieved would cause compaction to continually fail instead of using reclaim/compaction to try allocate the page. The time-based mechanism is clumsy but a better option is not obvious. 
Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rik van Riel r...@redhat.com --- Acked-by: Rafael Aquini aqu...@redhat.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 9/9] mm: compaction: Restart compaction from near where it left off
On Fri, Sep 21, 2012 at 11:46:23AM +0100, Mel Gorman wrote: This is almost entirely based on Rik's previous patches and discussions with him about how this might be implemented. Order 0 compaction stops when enough free pages of the correct page order have been coalesced. When doing subsequent higher order allocations, it is possible for compaction to be invoked many times. However, the compaction code always starts out looking for things to compact at the start of the zone, and for free pages to compact things to at the end of the zone. This can cause quadratic behaviour, with isolate_freepages starting at the end of the zone each time, even though previous invocations of the compaction code already filled up all free memory on that end of the zone. This can cause isolate_freepages to take enormous amounts of CPU with certain workloads on larger memory systems. This patch caches where the migration and free scanner should start from on subsequent compaction invocations using the pageblock-skip information. When compaction starts it begins from the cached restart points and will update the cached restart points until a page is isolated or a pageblock is skipped that would have been scanned by synchronous compaction. Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rik van Riel r...@redhat.com --- Acked-by: Rafael Aquini aqu...@redhat.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v11] kvm: Add resampling irqfds for level triggered interrupts
To emulate level triggered interrupts, add a resample option to KVM_IRQFD. When specified, a new resamplefd is provided that notifies the user when the irqchip has been resampled by the VM. This may, for instance, indicate an EOI. Also in this mode, posting of an interrupt through an irqfd only asserts the interrupt. On resampling, the interrupt is automatically de-asserted prior to user notification. This enables level triggered interrupts to be posted and re-enabled from vfio with no userspace intervention. All resampling irqfds can make use of a single irq source ID, so we reserve a new one for this interface. Signed-off-by: Alex Williamson alex.william...@redhat.com --- Taking Michael's idea to create our own lock for resampling irqfds simplifies things significantly. We can now avoid any nesting with the irqfds.lock spinlock so we can call everything we need to directly. Documentation/virtual/kvm/api.txt | 13 +++ arch/x86/kvm/x86.c|4 + include/linux/kvm.h | 12 +++ include/linux/kvm_host.h |5 + virt/kvm/eventfd.c| 150 - virt/kvm/irq_comm.c |6 + 6 files changed, 184 insertions(+), 6 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index bf33aaa..6240366 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1946,6 +1946,19 @@ the guest using the specified gsi pin. The irqfd is removed using the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd and kvm_irqfd.gsi. +With KVM_CAP_IRQFD_RESAMPLE, KVM_IRQFD supports a de-assert and notify +mechanism allowing emulation of level-triggered, irqfd-based +interrupts. When KVM_IRQFD_FLAG_RESAMPLE is set the user must pass an +additional eventfd in the kvm_irqfd.resamplefd field. When operating +in resample mode, posting of an interrupt through kvm_irq.fd asserts +the specified gsi in the irqchip. When the irqchip is resampled, such +as from an EOI, the gsi is de-asserted and the user is notifed via +kvm_irqfd.resamplefd. 
It is the user's responsibility to re-queue +the interrupt if the device making use of it still requires service. +Note that closing the resamplefd is not sufficient to disable the +irqfd. The KVM_IRQFD_FLAG_RESAMPLE is only necessary on assignment +and need not be specified with KVM_IRQFD_FLAG_DEASSIGN. + 4.76 KVM_PPC_ALLOCATE_HTAB Capability: KVM_CAP_PPC_ALLOC_HTAB diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2966c84..56f9002 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2177,6 +2177,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_GET_TSC_KHZ: case KVM_CAP_PCI_2_3: case KVM_CAP_KVMCLOCK_CTRL: + case KVM_CAP_IRQFD_RESAMPLE: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -6268,6 +6269,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, kvm-arch.irq_sources_bitmap); + /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */ + set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, + kvm-arch.irq_sources_bitmap); raw_spin_lock_init(kvm-arch.tsc_write_lock); diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 2ce09aa..a01a3d5 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -618,6 +618,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_PPC_GET_SMMU_INFO 78 #define KVM_CAP_S390_COW 79 #define KVM_CAP_PPC_ALLOC_HTAB 80 +#define KVM_CAP_IRQFD_RESAMPLE 81 #ifdef KVM_CAP_IRQ_ROUTING @@ -683,12 +684,21 @@ struct kvm_xen_hvm_config { #endif #define KVM_IRQFD_FLAG_DEASSIGN (1 0) +/* + * Available with KVM_CAP_IRQFD_RESAMPLE + * + * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies + * the irqfd to operate in resampling mode for level triggered interrupt + * emulation. See Documentation/virtual/kvm/api.txt. 
+ */ +#define KVM_IRQFD_FLAG_RESAMPLE (1 1) struct kvm_irqfd { __u32 fd; __u32 gsi; __u32 flags; - __u8 pad[20]; + __u32 resamplefd; + __u8 pad[16]; }; struct kvm_clock_data { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b70b48b..6966ce2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -70,7 +70,8 @@ #define KVM_REQ_PMU 16 #define KVM_REQ_PMI 17 -#define KVM_USERSPACE_IRQ_SOURCE_ID0 +#define KVM_USERSPACE_IRQ_SOURCE_ID0 +#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 struct kvm; struct kvm_vcpu; @@ -283,6 +284,8 @@ struct kvm { struct { spinlock_tlock; struct list_head items; + struct list_head resampler_list; + struct mutex resampler_lock; } irqfds;
Re: [Qemu-devel] [PATCH v5 0/4] VFIO-based PCI device assignment
Ping. There don't seem to be any objections to this. Thanks, Alex On Fri, 2012-09-14 at 17:04 -0600, Alex Williamson wrote: On Fri, 2012-09-14 at 17:01 -0600, Alex Williamson wrote: Same goodness as v4, plus: - Addressed comments by Blue Swirl (thanks for the review) (hopefully w/o breaking anything wrt slow bar endianness) - Fixed a couple checkpatch warnings that snuck in BTW, this works fine with Jason's Q35 patches though we will need to add INTx routing support for KVM accelerated INTx (and pci-assign). Thanks, Oh, as usual, the following branch is updated: git://github.com/awilliam/qemu-vfio.git vfio-for-qemu and a new tag is added: vfio-pci-for-qemu-v5 Thanks, Alex --- Alex Williamson (4): vfio: Enable vfio-pci and mark supported vfio: vfio-pci device assignment driver Update Linux kernel headers Update kernel header script to include vfio MAINTAINERS |5 configure |6 hw/Makefile.objs|3 hw/vfio_pci.c | 1864 +++ hw/vfio_pci_int.h | 114 ++ linux-headers/linux/vfio.h | 368 scripts/update-linux-headers.sh |2 7 files changed, 2360 insertions(+), 2 deletions(-) create mode 100644 hw/vfio_pci.c create mode 100644 hw/vfio_pci_int.h create mode 100644 linux-headers/linux/vfio.h -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 1/1] target-i386: Fix default Hypervisor level for hypervisor-vendor=kvm.
On 09/21/12 10:18, Eduardo Habkost wrote: On Thu, Sep 20, 2012 at 04:06:27PM -0400, Don Slutz wrote: From http://lkml.indiana.edu/hypermail/linux/kernel/1205.0/00100.html EAX should be KVM_CPUID_FEATURES (0x4001) not 0. Added hypervisor-vendor=kvm0 to get the older CPUID result. kvm1 selects the newer one. Why not just make hypervisor-vendor=kvm control only the hypervisor vendor string, and support something like kvm-hypervisor-level=0 to restore the old cpuid_hv_level=0 behavior? -cpu host,hypervisor-vendor=kvm,hypervisor-level=0 Does this. This is similar to the kvmclock case: it would allow us to make hypervisor-vendor=kvm use saner values as default, but letting old machine-types to override it for compatibility if required. Right now since I am using env-cpuid_hv_level == 0 as a flag. This means that: -cpu host,hypervisor-level=0,hypervisor-vendor=kvm -cpu host,hypervisor-vendor=kvm,hypervisor-level=0 end up with different CPUID data (Which I do not like). I will fix this in the next round. Did you want me to drop kvm0 and kvm1? -Don [...] -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [libvirt] TSC scaling interface to management
On 09/21/2012 05:51 AM, Marcelo Tosatti wrote: On Fri, Sep 21, 2012 at 12:02:46AM +0300, Dor Laor wrote: On 09/12/2012 06:39 PM, Marcelo Tosatti wrote: HW TSC scaling is a feature of AMD processors that allows a multiplier to be specified to the TSC frequency exposed to the guest. KVM also contains provision to trap TSC (KVM: Infrastructure for software and hardware based TSC rate scaling cc578287e3224d0da) or advance TSC frequency. This is useful when migrating to a host with different frequency and the guest is possibly using direct RDTSC instructions for purposes other than measuring cycles (that is, it previously calculated cycles-per-second, and uses that information which is stale after migration). qemu-x86: Set tsc_khz in kvm when supported (e7429073ed1a76518) added support for tsc_khz= option in QEMU. I am proposing the following changes so that management applications can work with this: 1) New option for tsc_khz, which is tsc_khz=host (QEMU command line option). Host means that QEMU is responsible for retrieving the TSC frequency of the host processor and use that. Management application does not have to deal with the burden. 2) New subsection with tsc_khz value. Destination host should consult supported features of running kernel and fail if feature is unsupported. It is not necessary to use this tsc_khz setting with modern guests using paravirtual clocks, or when its known that applications make proper use of the time interface provided by operating systems. On the other hand, legacy applications or setups which require no modification and correct operation while virtualized and make use of RDTSC might need this. Therefore it appears that this tsc_khz=auto option can be specified only if the user specifies so (it can be a per-guest flag hidden in the management configuration/manual). Sending this email to gather suggestions (or objections) to this interface. I'm not sure I understand the exact difference between the offers. 
We can define these 3 options: 1. Qemu/kvm won't make use of tsc scaling feature at all. 2. tsc scaling is used and we take the value either from the host or from the live migration data that overrides the later for incoming. As you've said, it should be passed through a sub section. 3. Manual setting of the value (uncommon). Is there another option worth considering? The questions is what should be the default. IMHO #2 is more appropriate to serve as a default since we do expect tsc to change between hosts. Option 1. is more appropriate to serve as a default given that modern guests make use of paravirt, as you have observed. but you also observed that legacy applications that use rdtsc (even over pv kernel) will still be affected by the physical tsc frequency. Since I'm not aware of downside for using scaling, I rather pick opt #2 as a default. That is, tsc scaling is only required if the guest does direct RDTSC on the expectation that the value won't change. Cheers, Dor -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 1/1] target-i386: Fix default Hypervisor level for hypervisor-vendor=kvm.
On 09/21/12 16:49, Eduardo Habkost wrote: On Fri, Sep 21, 2012 at 04:26:58PM -0400, Don Slutz wrote: On 09/21/12 10:18, Eduardo Habkost wrote: On Thu, Sep 20, 2012 at 04:06:27PM -0400, Don Slutz wrote: From http://lkml.indiana.edu/hypermail/linux/kernel/1205.0/00100.html EAX should be KVM_CPUID_FEATURES (0x4001) not 0. Added hypervisor-vendor=kvm0 to get the older CPUID result. kvm1 selects the newer one. Why not just make hypervisor-vendor=kvm control only the hypervisor vendor string, and support something like kvm-hypervisor-level=0 to restore the old cpuid_hv_level=0 behavior? -cpu host,hypervisor-vendor=kvm,hypervisor-level=0 Does this. Good. :-) This is similar to the kvmclock case: it would allow us to make hypervisor-vendor=kvm use saner values as default, but letting old machine-types to override it for compatibility if required. Right now since I am using env-cpuid_hv_level == 0 as a flag. This means that: -cpu host,hypervisor-level=0,hypervisor-vendor=kvm -cpu host,hypervisor-vendor=kvm,hypervisor-level=0 end up with different CPUID data (Which I do not like). I will fix this in the next round. Right. This has to be fixed. Did you want me to drop kvm0 and kvm1? Yes, if level is already configurable using the hypervisor-level property, I don't see the need for kvm0 and kvm1. If you make kvm_arch_init_vcpu() actually use those fields, you will end up implementing what's required to allow migration compatibility to be kept (the only thing missing is to make the CPU class a child of DeviceState, and add hypervisor-level=0 to the existing machine-types). :-) You mean like http://lists.gnu.org/archive/html/qemu-devel/2012-09/msg03402.html and http://lists.gnu.org/archive/html/qemu-devel/2012-09/msg03405.html already change kvm_arch_init_vcpu(). I did not know that I need to make the CPU class a child of DeviceState. Nor that I needed to add hypervisor-vendor=kvm,hypervisor-level=0 to the existing machine-types. 
Since without specifying hypervisor-level=0 it defaults to 0 and kvm_arch_init_vcpu() will default to setting hypervisor-vendor=kvm. -Don -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 4/9] mm: compaction: Abort compaction loop if lock is contended or run too long
On Fri, 21 Sep 2012 11:46:18 +0100 Mel Gorman mgor...@suse.de wrote: Changelog since V2 o Fix BUG_ON triggered due to pages left on cc.migratepages o Make compact_zone_order() require non-NULL arg `contended' Changelog since V1 o only abort the compaction if lock is contended or run too long o Rearranged the code by Andrea Arcangeli. isolate_migratepages_range() might isolate no pages if for example when zone-lru_lock is contended and running asynchronous compaction. In this case, we should abort compaction, otherwise, compact_zone will run a useless loop and make zone-lru_lock is even contended. hm, this appears to be identical to mm-compaction-abort-compaction-loop-if-lock-is-contended-or-run-too-long.patch mm-compaction-abort-compaction-loop-if-lock-is-contended-or-run-too-long-fix.patch mm-compaction-abort-compaction-loop-if-lock-is-contended-or-run-too-long-fix-2.patch so I simply omitted patches 2, 3 and 4. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 6/9] mm: compaction: Acquire the zone-lock as late as possible
On Fri, 21 Sep 2012 11:46:20 +0100 Mel Gorman mgor...@suse.de wrote: Compactions free scanner acquires the zone-lock when checking for PageBuddy pages and isolating them. It does this even if there are no PageBuddy pages in the range. This patch defers acquiring the zone lock for as long as possible. In the event there are no free pages in the pageblock then the lock will not be acquired at all which reduces contention on zone-lock. ... --- a/mm/compaction.c +++ b/mm/compaction.c @@ -93,6 +93,28 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock, return compact_checklock_irqsave(lock, flags, false, cc); } +/* Returns true if the page is within a block suitable for migration to */ +static bool suitable_migration_target(struct page *page) +{ + stray newline + int migratetype = get_pageblock_migratetype(page); + + /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ + if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) + return false; + + /* If the page is a large free page, then allow migration */ + if (PageBuddy(page) page_order(page) = pageblock_order) + return true; + + /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ + if (migrate_async_suitable(migratetype)) + return true; + + /* Otherwise skip the block */ + return false; +} + ... @@ -168,23 +193,38 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn, int isolated, i; struct page *page = cursor; - if (!pfn_valid_within(blockpfn)) { - if (strict) - return 0; - continue; - } + if (!pfn_valid_within(blockpfn)) + goto strict_check; nr_scanned++; - if (!PageBuddy(page)) { - if (strict) - return 0; - continue; - } + if (!PageBuddy(page)) + goto strict_check; + + /* + * The zone lock must be held to isolate freepages. This + * unfortunately this is a very coarse lock and can be this this + * heavily contended if there are parallel allocations + * or parallel compactions. 
For async compaction do not + * spin on the lock and we acquire the lock as late as + * possible. + */ + locked = compact_checklock_irqsave(cc-zone-lock, flags, + locked, cc); + if (!locked) + break; + + /* Recheck this is a suitable migration target under lock */ + if (!strict !suitable_migration_target(page)) + break; + + /* Recheck this is a buddy page under lock */ + if (!PageBuddy(page)) + goto strict_check; /* Found a free page, break it into order-0 pages */ isolated = split_free_page(page); if (!isolated strict) - return 0; + goto strict_check; total_isolated += isolated; for (i = 0; i isolated; i++) { list_add(page-lru, freelist); @@ -196,9 +236,23 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn, blockpfn += isolated - 1; cursor += isolated - 1; } + + continue; + +strict_check: + /* Abort isolation if the caller requested strict isolation */ + if (strict) { + total_isolated = 0; + goto out; + } } trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); + +out: + if (locked) + spin_unlock_irqrestore(cc-zone-lock, flags); + return total_isolated; } A a few things about this function. Would it be cleaner if we did if (!strict) { if (!suitable_migration_target(page)) break; } else { if (!PageBuddy(page)) goto strict_check; } and then remove the test of `strict' from strict_check (which then should be renamed)? Which perhaps means that the whole `strict_check' block can go away: if (!strict) { if (!suitable_migration_target(page)) break; } else { if (!PageBuddy(page)) { total_isolated = 0; goto out; } Have a think about it? The function is a little straggly. Secondly, it is correct/desirable to skip the (now misnamed
Re: [PATCH 8/9] mm: compaction: Cache if a pageblock was scanned and no pages were isolated
On Fri, 21 Sep 2012 11:46:22 +0100 Mel Gorman mgor...@suse.de wrote: When compaction was implemented it was known that scanning could potentially be excessive. The ideal was that a counter be maintained for each pageblock but maintaining this information would incur a severe penalty due to a shared writable cache line. It has reached the point where the scanning costs are an serious problem, particularly on long-lived systems where a large process starts and allocates a large number of THPs at the same time. Instead of using a shared counter, this patch adds another bit to the pageblock flags called PG_migrate_skip. If a pageblock is scanned by either migrate or free scanner and 0 pages were isolated, the pageblock is marked to be skipped in the future. When scanning, this bit is checked before any scanning takes place and the block skipped if set. The main difficulty with a patch like this is when to ignore the cached information? If it's ignored too often, the scanning rates will still be excessive. If the information is too stale then allocations will fail that might have otherwise succeeded. In this patch o CMA always ignores the information o If the migrate and free scanner meet then the cached information will be discarded if it's at least 5 seconds since the last time the cache was discarded o If there are a large number of allocation failures, discard the cache. The time-based heuristic is very clumsy but there are few choices for a better event. Depending solely on multiple allocation failures still allows excessive scanning when THP allocations are failing in quick succession due to memory pressure. Waiting until memory pressure is relieved would cause compaction to continually fail instead of using reclaim/compaction to try allocate the page. The time-based mechanism is clumsy but a better option is not obvious. ick. Wall time has sooo little relationship to what's happening in there. 
If we *have* to use polling, cannot we clock the poll with some metric which is at least vaguely related to the amount of activity? Number (or proportion) of pages scanned, for example? Or reset everything on the Nth trip around the zone? Or even a combination of one of these *and* of wall time, so the system will at least work harder when MM is under load. Also, what has to be done to avoid the polling altogether? eg/ie, zap a pageblock's PB_migrate_skip synchronously, when something was done to that pageblock which justifies repolling it? ... +static void reset_isolation_suitable(struct zone *zone) +{ + unsigned long start_pfn = zone-zone_start_pfn; + unsigned long end_pfn = zone-zone_start_pfn + zone-spanned_pages; + unsigned long pfn; + + /* + * Do not reset more than once every five seconds. If allocations are + * failing sufficiently quickly to allow this to happen then continually + * scanning for compaction is not going to help. The choice of five + * seconds is arbitrary but will mitigate excessive scanning. + */ + if (time_before(jiffies, zone-compact_blockskip_expire)) + return; + zone-compact_blockskip_expire = jiffies + (HZ * 5); + + /* Walk the zone and mark every pageblock as suitable for isolation */ + for (pfn = start_pfn; pfn end_pfn; pfn += pageblock_nr_pages) { + struct page *page; + if (!pfn_valid(pfn)) + continue; + + page = pfn_to_page(pfn); + if (zone != page_zone(page)) + continue; + + clear_pageblock_skip(page); + } What's the worst-case loop count here? +} + ... -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 1/1] target-i386: Fix default Hypervisor level for hypervisor-vendor=kvm.
On Fri, Sep 21, 2012 at 05:28:27PM -0400, Don Slutz wrote: On 09/21/12 16:49, Eduardo Habkost wrote: On Fri, Sep 21, 2012 at 04:26:58PM -0400, Don Slutz wrote: On 09/21/12 10:18, Eduardo Habkost wrote: On Thu, Sep 20, 2012 at 04:06:27PM -0400, Don Slutz wrote: From http://lkml.indiana.edu/hypermail/linux/kernel/1205.0/00100.html EAX should be KVM_CPUID_FEATURES (0x4001) not 0. Added hypervisor-vendor=kvm0 to get the older CPUID result. kvm1 selects the newer one. Why not just make hypervisor-vendor=kvm control only the hypervisor vendor string, and support something like kvm-hypervisor-level=0 to restore the old cpuid_hv_level=0 behavior? -cpu host,hypervisor-vendor=kvm,hypervisor-level=0 Does this. Good. :-) This is similar to the kvmclock case: it would allow us to make hypervisor-vendor=kvm use saner values as default, but letting old machine-types to override it for compatibility if required. Right now since I am using env-cpuid_hv_level == 0 as a flag. This means that: -cpu host,hypervisor-level=0,hypervisor-vendor=kvm -cpu host,hypervisor-vendor=kvm,hypervisor-level=0 end up with different CPUID data (Which I do not like). I will fix this in the next round. Right. This has to be fixed. Did you want me to drop kvm0 and kvm1? Yes, if level is already configurable using the hypervisor-level property, I don't see the need for kvm0 and kvm1. If you make kvm_arch_init_vcpu() actually use those fields, you will end up implementing what's required to allow migration compatibility to be kept (the only thing missing is to make the CPU class a child of DeviceState, and add hypervisor-level=0 to the existing machine-types). :-) You mean like http://lists.gnu.org/archive/html/qemu-devel/2012-09/msg03402.html and http://lists.gnu.org/archive/html/qemu-devel/2012-09/msg03405.html already change kvm_arch_init_vcpu(). Yes! Sorry, I hadn't reviewed all of your previous series yet. :-) I did not know that I need to make the CPU class a child of DeviceState. You don't. 
But we plan to do that, eventually, so we can put CPU compatibility properties into machine-types. Nor that I needed to add hypervisor-vendor=kvm,hypervisor-level=0 to the existing machine-types. Since without specifying hypervisor-level=0 it defaults to 0 and kvm_arch_init_vcpu() will default to setting hypervisor-vendor=kvm. What I would like to do is: to change the default to 0x4001 (that's the correct value), but make the pc-1.1 and older machine-types keep the hypervisor-level=0 default for compatibility. (But to be able to do that, we need to first make the CPU class a child of DeviceState, so that's something to be done later) -- Eduardo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH v3 11/19] Implement qmp and hmp commands for notification lists
On 09/21/2012 05:17 AM, Vasilis Liaskovitis wrote: Guest can respond to ACPI hotplug events e.g. with _EJ or _OST method. This patch implements a tail queue to store guest notifications for memory hot-add and hot-remove requests. Guest responses for memory hotplug command on a per-dimm basis can be detected with the new hmp command info memhp or the new qmp command query-memhp Naming doesn't match the QMP code. Examples: (qemu) device_add dimm,id=ram0 These notification items should probably be part of migration state (not yet implemented). In the case of libvirt driving migration, you already said in 10/19 that libvirt has to start the destination with the populated=on|off fields correct for each dimm according to the state it was in at the time the host started the update. Can the host hot unplug memory after migration has started? + +## +# @MemHpInfo: +# +# Information about status of a memory hotplug command +# +# @dimm: the Dimm associated with the result +# +# @result: the result of the hotplug command +# +# Since: 1.3 +# +## +{ 'type': 'MemHpInfo', + 'data': {'dimm': 'str', 'request': 'str', 'result': 'str'} } Should 'result' be a bool (true for success, false for still pending) or an enum, instead of a free-form string? Likewise, isn't 'request' going to be exactly one of two values (plug or unplug)? -- Eric Blake ebl...@redhat.com+1-919-301-3266 Libvirt virtualization library http://libvirt.org signature.asc Description: OpenPGP digital signature
Re: [RFC PATCH v3 12/19] Implement info memory-total and query-memory-total
On 09/21/2012 05:17 AM, Vasilis Liaskovitis wrote: Returns total physical memory available to guest in bytes, including hotplugged memory. Note that the number reported here may be different from what the guest sees e.g. if the guest has not logically onlined hotplugged memory. This functionality is provided independently of a balloon device, since a guest can be using ACPI memory hotplug without using a balloon device. Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com +++ b/hmp-commands.hx @@ -1464,6 +1464,8 @@ show qdev device model list show roms @item info memory-hotplug show memory-hotplug +@item info memory-total +show memory-total +++ b/monitor.c @@ -2747,6 +2747,13 @@ static mon_cmd_t info_cmds[] = { .mhandler.info = hmp_info_memory_hotplug, }, { +.name = memory-total, We're split on HMP naming conventions ('-' vs. '_'); we have 'show migrate_capabilities' but 'show block-jobs'. Oh well, that's life. Reviewed-by: Eric Blake ebl...@redhat.com -- Eric Blake ebl...@redhat.com+1-919-301-3266 Libvirt virtualization library http://libvirt.org signature.asc Description: OpenPGP digital signature
[PATCH v5 00/17] Allow changing of Hypervisor CPUIDs.
Also known as Paravirtualization CPUIDs. This is primarily done so that the guest will think it is running under vmware when hypervisor-vendor=vmware is specified as a property of a cpu. This depends on: http://lists.gnu.org/archive/html/qemu-devel/2012-09/msg01400.html As far as I know it is #4. It depends on (1) and (2) and (3). This change is based on: Microsoft Hypervisor CPUID Leaves: http://msdn.microsoft.com/en-us/library/windows/hardware/ff542428%28v=vs.85%29.aspx Linux kernel change starts with: http://fixunix.com/kernel/538707-use-cpuid-communicate-hypervisor.html Also: http://lkml.indiana.edu/hypermail/linux/kernel/1205.0/00100.html VMware documentation on CPUIDs (Mechanisms to determine if software is running in a VMware virtual machine): http://kb.vmware.com/selfservice/microsites/search.do?language=en_UScmd=displayKCexternalId=1009458 Changes from v4 to v5: Undo kvm_clock2 change. Add cpuid_hv_level_set; cpuid_hv_level == 0 is now valid. Add cpuid_hv_vendor_set; the null string is now valid. Handle kvm and cpuid_hv_level == 0. hypervisor-vendor=kvm,hypervisor-level=0 and hypervisor-level=0,hypervisor-vendor=kvm now do the same thing. Changes from v3 to v4: Added CPUID_HV_LEVEL_HYPERV, CPUID_HV_LEVEL_KVM. Added CPUID_HV_VENDOR_HYPERV. Added hyperv as known hypervisor-vendor. Allow hypervisor-level to be 0. Changes from v2 to v3: Clean post to qemu-devel. Changes from v1 to v2: 1) Added 1/4 from http://lists.gnu.org/archive/html/qemu-devel/2012-08/msg05153.html Because Fred is changing jobs and so will not be pushing to get this in. It needed to be rebased, And I needed it to complete the testing of this change. 2) Added 2/4 because of the re-work I needed a way to clear all KVM bits, 3) The rework of v1. Make it fit into the object model re-work of cpu.c for x86. 4) Added 3/4 -- The split out of the code that is not needed for accel=kvm. 
Changes from v2 to v3: Marcelo Tosatti: Its one big patch, better split in logically correlated patches (with better changelog). This would help reviewers. So split 3 and 4 into 3 to 17. More info in change log. No code change. Don Slutz (17): target-i386: Allow tsc-frequency to be larger then 2.147G target-i386: Add missing kvm bits. target-i386: Add Hypervisor level. target-i386: Add cpu object access routines for Hypervisor level. target-i386: Add cpu object access routines for Hypervisor level. target-i386: Use Hypervisor level in -machine pc,accel=kvm. target-i386: Use Hypervisor level in -machine pc,accel=tcg. target-i386: Add Hypervisor vendor. target-i386: Add cpu object access routines for Hypervisor vendor. target-i386: Use Hypervisor vendor in -machine pc,accel=kvm. target-i386: Use Hypervisor vendor in -machine pc,accel=tcg. target-i386: Add some known names to Hypervisor vendor. target-i386: Add optional Hypervisor leaf extra. target-i386: Add cpu object access routines for Hypervisor leaf extra. target-i386: Add setting of Hypervisor leaf extra for known vmare4. target-i386: Use Hypervisor leaf extra in -machine pc,accel=kvm. target-i386: Use Hypervisor leaf extra in -machine pc,accel=tcg. target-i386/cpu.c | 285 - target-i386/cpu.h | 34 +++ target-i386/kvm.c | 33 +- 3 files changed, 341 insertions(+), 11 deletions(-) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 01/17] target-i386: Allow tsc-frequency to be larger than 2.147G
The check using INT_MAX (2147483647) is wrong in this case. Signed-off-by: Fred Oliveira folive...@cloudswitch.com Signed-off-by: Don Slutz d...@cloudswitch.com --- target-i386/cpu.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/target-i386/cpu.c b/target-i386/cpu.c index af50a8f..0313cf5 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -1146,7 +1146,7 @@ static void x86_cpuid_set_tsc_freq(Object *obj, Visitor *v, void *opaque, { X86CPU *cpu = X86_CPU(obj); const int64_t min = 0; -const int64_t max = INT_MAX; +const int64_t max = INT64_MAX; int64_t value; visit_type_freq(v, value, name, errp); -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 03/17] target-i386: Add Hypervisor level.
Also known as Paravirtualization level or maximum cpuid function present in this leaf. This is just the EAX value for 0x4000. QEMU knows this is KVM_CPUID_SIGNATURE (0x4000). This is based on: Microsoft Hypervisor CPUID Leaves: http://msdn.microsoft.com/en-us/library/windows/hardware/ff542428%28v=vs.85%29.aspx Linux kernel change starts with: http://fixunix.com/kernel/538707-use-cpuid-communicate-hypervisor.html Also: http://lkml.indiana.edu/hypermail/linux/kernel/1205.0/00100.html VMware documentation on CPUIDs (Mechanisms to determine if software is running in a VMware virtual machine): http://kb.vmware.com/selfservice/microsites/search.do?language=en_UScmd=displayKCexternalId=1009458 QEMU has the value HYPERV_CPUID_MIN defined. Signed-off-by: Don Slutz d...@cloudswitch.com --- target-i386/cpu.h |3 +++ 1 files changed, 3 insertions(+), 0 deletions(-) diff --git a/target-i386/cpu.h b/target-i386/cpu.h index 5265c5a..1899f69 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -777,11 +777,14 @@ typedef struct CPUX86State { uint32_t cpuid_ext3_features; uint32_t cpuid_apic_id; bool cpuid_vendor_override; +bool cpuid_hv_level_set; /* Store the results of Centaur's CPUID instructions */ uint32_t cpuid_xlevel2; uint32_t cpuid_ext4_features; /* Flags from CPUID[EAX=7,ECX=0].EBX */ uint32_t cpuid_7_0_ebx; +/* Hypervisor CPUIDs */ +uint32_t cpuid_hv_level; /* MTRRs */ uint64_t mtrr_fixed[11]; -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 02/17] target-i386: Add missing kvm bits.
Fix duplicate name (kvmclock = kvm_clock2) also. Signed-off-by: Don Slutz d...@cloudswitch.com --- target-i386/cpu.c | 12 1 files changed, 8 insertions(+), 4 deletions(-) diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 0313cf5..25ca986 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -87,10 +87,14 @@ static const char *ext3_feature_name[] = { }; static const char *kvm_feature_name[] = { -kvmclock, kvm_nopiodelay, kvm_mmu, kvmclock, kvm_asyncpf, NULL, kvm_pv_eoi, NULL, -NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, -NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, -NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +kvmclock, kvm_nopiodelay, kvm_mmu, kvmclock, +kvm_asyncpf, kvm_steal_time, kvm_pv_eoi, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +kvm_clock_stable, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, }; static const char *svm_feature_name[] = { -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 04/17] target-i386: Add cpu object access routines for Hypervisor level.
These are modeled after x86_cpuid_get_xlevel and x86_cpuid_set_xlevel. Signed-off-by: Don Slutz d...@cloudswitch.com --- target-i386/cpu.c | 29 + 1 files changed, 29 insertions(+), 0 deletions(-) diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 25ca986..7b31de9 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -1166,6 +1166,32 @@ static void x86_cpuid_set_tsc_freq(Object *obj, Visitor *v, void *opaque, cpu-env.tsc_khz = value / 1000; } +static void x86_cpuid_get_hv_level(Object *obj, Visitor *v, void *opaque, +const char *name, Error **errp) +{ +X86CPU *cpu = X86_CPU(obj); + +visit_type_uint32(v, cpu-env.cpuid_hv_level, name, errp); +} + +static void x86_cpuid_set_hv_level(Object *obj, Visitor *v, void *opaque, +const char *name, Error **errp) +{ +X86CPU *cpu = X86_CPU(obj); +uint32_t value; + +visit_type_uint32(v, value, name, errp); +if (error_is_set(errp)) { +return; +} + +if ((value != 0) (value 0x4000)) { +value += 0x4000; +} +cpu-env.cpuid_hv_level = value; +cpu-env.cpuid_hv_level_set = true; +} + #if !defined(CONFIG_USER_ONLY) static void x86_get_hv_spinlocks(Object *obj, Visitor *v, void *opaque, const char *name, Error **errp) @@ -2061,6 +2087,9 @@ static void x86_cpu_initfn(Object *obj) object_property_add(obj, enforce, bool, x86_cpuid_get_enforce, x86_cpuid_set_enforce, NULL, NULL, NULL); +object_property_add(obj, hypervisor-level, int, +x86_cpuid_get_hv_level, +x86_cpuid_set_hv_level, NULL, NULL, NULL); #if !defined(CONFIG_USER_ONLY) object_property_add(obj, hv_spinlocks, int, x86_get_hv_spinlocks, -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 05/17] target-i386: Add cpu object access routines for Hypervisor level.
These are modeled after x86_cpuid_get_xlevel and x86_cpuid_set_xlevel. Signed-off-by: Don Slutz d...@cloudswitch.com --- target-i386/cpu.c |8 target-i386/cpu.h |2 ++ 2 files changed, 10 insertions(+), 0 deletions(-) diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 7b31de9..e5deca2 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -1193,6 +1193,11 @@ static void x86_cpuid_set_hv_level(Object *obj, Visitor *v, void *opaque, } #if !defined(CONFIG_USER_ONLY) +static void x86_set_hyperv(Object *obj, Error **errp) +{ +object_property_set_int(obj, CPUID_HV_LEVEL_HYPERV, hypervisor-level, errp); +} + static void x86_get_hv_spinlocks(Object *obj, Visitor *v, void *opaque, const char *name, Error **errp) { @@ -1215,6 +1220,7 @@ static void x86_set_hv_spinlocks(Object *obj, Visitor *v, void *opaque, return; } hyperv_set_spinlock_retries(value); +x86_set_hyperv(obj, errp); } static void x86_get_hv_relaxed(Object *obj, Visitor *v, void *opaque, @@ -1235,6 +1241,7 @@ static void x86_set_hv_relaxed(Object *obj, Visitor *v, void *opaque, return; } hyperv_enable_relaxed_timing(value); +x86_set_hyperv(obj, errp); } static void x86_get_hv_vapic(Object *obj, Visitor *v, void *opaque, @@ -1255,6 +1262,7 @@ static void x86_set_hv_vapic(Object *obj, Visitor *v, void *opaque, return; } hyperv_enable_vapic_recommended(value); +x86_set_hyperv(obj, errp); } #endif diff --git a/target-i386/cpu.h b/target-i386/cpu.h index 1899f69..3152a4e 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -488,6 +488,8 @@ #define CPUID_VENDOR_VIA CentaurHauls +#define CPUID_HV_LEVEL_HYPERV 0x4005 + #define CPUID_MWAIT_IBE (1 1) /* Interrupts can exit capability */ #define CPUID_MWAIT_EMX (1 0) /* enumeration supported */ -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 06/17] target-i386: Use Hypervisor level in -machine pc,accel=kvm.
Also known as Paravirtualization level. This change is based on: Microsoft Hypervisor CPUID Leaves: http://msdn.microsoft.com/en-us/library/windows/hardware/ff542428%28v=vs.85%29.aspx Linux kernel change starts with: http://fixunix.com/kernel/538707-use-cpuid-communicate-hypervisor.html Also: http://lkml.indiana.edu/hypermail/linux/kernel/1205.0/00100.html VMware documentation on CPUIDs (Mechanisms to determine if software is running in a VMware virtual machine): http://kb.vmware.com/selfservice/microsites/search.do?language=en_UScmd=displayKCexternalId=1009458 QEMU knows this is KVM_CPUID_SIGNATURE (0x4000). Signed-off-by: Don Slutz d...@cloudswitch.com --- target-i386/kvm.c |4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/target-i386/kvm.c b/target-i386/kvm.c index 895d848..8462c75 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -389,12 +389,12 @@ int kvm_arch_init_vcpu(CPUX86State *env) c = cpuid_data.entries[cpuid_i++]; memset(c, 0, sizeof(*c)); c-function = KVM_CPUID_SIGNATURE; -if (!hyperv_enabled()) { +if (!env-cpuid_hv_level_set) { memcpy(signature, KVMKVMKVM\0\0\0, 12); c-eax = 0; } else { memcpy(signature, Microsoft Hv, 12); -c-eax = HYPERV_CPUID_MIN; +c-eax = env-cpuid_hv_level; } c-ebx = signature[0]; c-ecx = signature[1]; -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html