[PATCH v3 0/7] KVM: MMU: fix release pfn in mmu code
Changelog: changes from Avi's comments: - comment for FNAME(fetch) - add annotations (__acquires, __releases) for page_fault_start and page_fault_end changes from Marcelo's comments: - remove mmu_is_invalid - make release noslot pfn path more readable The last patch which introduces page_fault_start and page_fault_end is controversial; I hope we can try it since it wraps the ugly pfn release path up, but I respect your idea. :) Releasing a pfn in the mmu code is a little special because we allow a no-slot pfn to go through the spte walk on the page fault path; that means, on the page fault failure path, we cannot directly call kvm_release_pfn_clean. This patchset fixes the bug which releases a no-slot pfn on the failure path and cleans up all the paths where kvm_release_pfn_clean is called -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 1/7] KVM: MMU: fix release noslot pfn
We can not directly call kvm_release_pfn_clean to release the pfn since we can meet noslot pfn which is used to cache mmio info into spte Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/mmu.c |6 -- arch/x86/kvm/paging_tmpl.h |6 -- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index aa0b469..0f56169 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2877,7 +2877,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, out_unlock: spin_unlock(vcpu-kvm-mmu_lock); - kvm_release_pfn_clean(pfn); + if (likely(!is_noslot_pfn(pfn))) + kvm_release_pfn_clean(pfn); return 0; } @@ -3345,7 +3346,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, out_unlock: spin_unlock(vcpu-kvm-mmu_lock); - kvm_release_pfn_clean(pfn); + if (likely(!is_noslot_pfn(pfn))) + kvm_release_pfn_clean(pfn); return 0; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index bf8c42b..9ce6bc0 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -544,7 +544,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, out_gpte_changed: if (sp) kvm_mmu_put_page(sp, it.sptep); - kvm_release_pfn_clean(pfn); + if (likely(!is_noslot_pfn(pfn))) + kvm_release_pfn_clean(pfn); return NULL; } @@ -645,7 +646,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, out_unlock: spin_unlock(vcpu-kvm-mmu_lock); - kvm_release_pfn_clean(pfn); + if (likely(!is_noslot_pfn(pfn))) + kvm_release_pfn_clean(pfn); return 0; } -- 1.7.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 2/7] KVM: MMU: remove mmu_is_invalid
Remove mmu_is_invalid and use is_invalid_pfn instead Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/mmu.c |5 - arch/x86/kvm/paging_tmpl.h |4 ++-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0f56169..3e9728b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2700,11 +2700,6 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, } } -static bool mmu_invalid_pfn(pfn_t pfn) -{ - return unlikely(is_invalid_pfn(pfn)); -} - static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, pfn_t pfn, unsigned access, int *ret_val) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 9ce6bc0..b400761 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -370,7 +370,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, pgprintk(%s: gpte %llx spte %p\n, __func__, (u64)gpte, spte); pte_access = sp-role.access FNAME(gpte_access)(vcpu, gpte, true); pfn = gfn_to_pfn_atomic(vcpu-kvm, gpte_to_gfn(gpte)); - if (mmu_invalid_pfn(pfn)) + if (is_invalid_pfn(pfn)) return; /* @@ -446,7 +446,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, gfn = gpte_to_gfn(gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, pte_access ACC_WRITE_MASK); - if (mmu_invalid_pfn(pfn)) + if (is_invalid_pfn(pfn)) break; mmu_set_spte(vcpu, spte, sp-role.access, pte_access, 0, 0, -- 1.7.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 3/7] KVM: MMU: do not release pfn in mmu_set_spte
It helps us to cleanup release pfn in the later patches Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/mmu.c | 29 ++--- arch/x86/kvm/paging_tmpl.h | 18 -- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3e9728b..bc1cda4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2496,9 +2496,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, rmap_recycle(vcpu, sptep, gfn); } } - - if (!is_error_pfn(pfn)) - kvm_release_pfn_clean(pfn); } static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) @@ -2535,12 +2532,15 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, if (ret = 0) return -1; - for (i = 0; i ret; i++, gfn++, start++) + for (i = 0; i ret; i++, gfn++, start++) { mmu_set_spte(vcpu, start, ACC_ALL, access, 0, 0, NULL, sp-role.level, gfn, page_to_pfn(pages[i]), true, true); + kvm_release_page_clean(pages[i]); + } + return 0; } @@ -2858,23 +2858,22 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, return r; spin_lock(vcpu-kvm-mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) + if (mmu_notifier_retry(vcpu, mmu_seq)) { + r = 0; goto out_unlock; + } + kvm_mmu_free_some_pages(vcpu); if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, gfn, pfn, level); r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, prefault); - spin_unlock(vcpu-kvm-mmu_lock); - - - return r; out_unlock: spin_unlock(vcpu-kvm-mmu_lock); if (likely(!is_noslot_pfn(pfn))) kvm_release_pfn_clean(pfn); - return 0; + return r; } @@ -3328,22 +3327,22 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, return r; spin_lock(vcpu-kvm-mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) + if (mmu_notifier_retry(vcpu, mmu_seq)) { + r = 0; goto out_unlock; + } + kvm_mmu_free_some_pages(vcpu); if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, gfn, pfn, level); r = __direct_map(vcpu, gpa, write, map_writable, 
level, gfn, pfn, prefault); - spin_unlock(vcpu-kvm-mmu_lock); - - return r; out_unlock: spin_unlock(vcpu-kvm-mmu_lock); if (likely(!is_noslot_pfn(pfn))) kvm_release_pfn_clean(pfn); - return 0; + return r; } static void nonpaging_free(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index b400761..56f8085 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -380,6 +380,9 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, mmu_set_spte(vcpu, spte, sp-role.access, pte_access, 0, 0, NULL, PT_PAGE_TABLE_LEVEL, gpte_to_gfn(gpte), pfn, true, true); + + if (likely(!is_noslot_pfn(pfn))) + kvm_release_pfn_clean(pfn); } static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, @@ -452,6 +455,9 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, mmu_set_spte(vcpu, spte, sp-role.access, pte_access, 0, 0, NULL, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true); + + if (likely(!is_noslot_pfn(pfn))) + kvm_release_pfn_clean(pfn); } } @@ -544,8 +550,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, out_gpte_changed: if (sp) kvm_mmu_put_page(sp, it.sptep); - if (likely(!is_noslot_pfn(pfn))) - kvm_release_pfn_clean(pfn); + return NULL; } @@ -625,8 +630,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, return r; spin_lock(vcpu-kvm-mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) + if (mmu_notifier_retry(vcpu, mmu_seq)) { + r = 0; goto out_unlock; + } kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); kvm_mmu_free_some_pages(vcpu); @@ -640,15 +647,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, ++vcpu-stat.pf_fixed; kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); - spin_unlock(vcpu-kvm-mmu_lock); - return emulate; + r = emulate; out_unlock: spin_unlock(vcpu-kvm-mmu_lock); if
[PATCH v3 4/7] KVM: MMU: cleanup FNAME(page_fault)
Let it return emulate state instead of spte like __direct_map Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/paging_tmpl.h | 31 --- 1 files changed, 12 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 56f8085..6fd1c8b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -463,21 +463,21 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, /* * Fetch a shadow pte for a specific level in the paging hierarchy. + * If the guest tries to write a write-protected page, we need to + * emulate this operation, return 1 to indicate this case. */ -static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, +static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *gw, int user_fault, int write_fault, int hlevel, -int *emulate, pfn_t pfn, bool map_writable, -bool prefault) +pfn_t pfn, bool map_writable, bool prefault) { - unsigned access = gw-pt_access; struct kvm_mmu_page *sp = NULL; - int top_level; - unsigned direct_access; struct kvm_shadow_walk_iterator it; + unsigned direct_access, access = gw-pt_access; + int top_level, emulate = 0; if (!is_present_gpte(gw-ptes[gw-level - 1])) - return NULL; + return 0; direct_access = gw-pte_access; @@ -541,17 +541,17 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, clear_sp_write_flooding_count(it.sptep); mmu_set_spte(vcpu, it.sptep, access, gw-pte_access, -user_fault, write_fault, emulate, it.level, +user_fault, write_fault, emulate, it.level, gw-gfn, pfn, prefault, map_writable); FNAME(pte_prefetch)(vcpu, gw, it.sptep); - return it.sptep; + return emulate; out_gpte_changed: if (sp) kvm_mmu_put_page(sp, it.sptep); - return NULL; + return 0; } /* @@ -574,8 +574,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int write_fault = error_code PFERR_WRITE_MASK; int user_fault = error_code PFERR_USER_MASK; struct guest_walker 
walker; - u64 *sptep; - int emulate = 0; int r; pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; @@ -639,17 +637,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, kvm_mmu_free_some_pages(vcpu); if (!force_pt_level) transparent_hugepage_adjust(vcpu, walker.gfn, pfn, level); - sptep = FNAME(fetch)(vcpu, addr, walker, user_fault, write_fault, -level, emulate, pfn, map_writable, prefault); - (void)sptep; - pgprintk(%s: shadow pte %p %llx emulate %d\n, __func__, -sptep, *sptep, emulate); + r = FNAME(fetch)(vcpu, addr, walker, user_fault, write_fault, +level, pfn, map_writable, prefault); ++vcpu-stat.pf_fixed; kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); - r = emulate; - out_unlock: spin_unlock(vcpu-kvm-mmu_lock); if (likely(!is_noslot_pfn(pfn))) -- 1.7.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 5/7] KVM: MMU: introduce FNAME(prefetch_gpte)
The only difference between FNAME(update_pte) and FNAME(pte_prefetch) is that the former is allowed to prefetch gfn from dirty logged slot, so introduce a common function to prefetch spte Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/paging_tmpl.h | 58 ++- 1 files changed, 24 insertions(+), 34 deletions(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6fd1c8b..5b1af72 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -356,33 +356,45 @@ no_present: return true; } -static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, const void *pte) +static bool +FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, +u64 *spte, pt_element_t gpte, bool no_dirty_log) { - pt_element_t gpte; unsigned pte_access; + gfn_t gfn; pfn_t pfn; - gpte = *(const pt_element_t *)pte; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) - return; + return false; pgprintk(%s: gpte %llx spte %p\n, __func__, (u64)gpte, spte); + + gfn = gpte_to_gfn(gpte); pte_access = sp-role.access FNAME(gpte_access)(vcpu, gpte, true); - pfn = gfn_to_pfn_atomic(vcpu-kvm, gpte_to_gfn(gpte)); + pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, + no_dirty_log (pte_access ACC_WRITE_MASK)); if (is_invalid_pfn(pfn)) - return; + return false; /* -* we call mmu_set_spte() with host_writable = true because that -* vcpu-arch.update_pte.pfn was fetched from get_user_pages(write = 1). +* we call mmu_set_spte() with host_writable = true because +* pte_prefetch_gfn_to_pfn always gets a writable pfn. 
*/ mmu_set_spte(vcpu, spte, sp-role.access, pte_access, 0, 0, -NULL, PT_PAGE_TABLE_LEVEL, -gpte_to_gfn(gpte), pfn, true, true); +NULL, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true); if (likely(!is_noslot_pfn(pfn))) kvm_release_pfn_clean(pfn); + + return true; +} + +static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + u64 *spte, const void *pte) +{ + pt_element_t gpte = *(const pt_element_t *)pte; + + FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false); } static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, @@ -428,36 +440,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, spte = sp-spt + i; for (i = 0; i PTE_PREFETCH_NUM; i++, spte++) { - pt_element_t gpte; - unsigned pte_access; - gfn_t gfn; - pfn_t pfn; - if (spte == sptep) continue; if (is_shadow_present_pte(*spte)) continue; - gpte = gptep[i]; - - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) - continue; - - pte_access = sp-role.access FNAME(gpte_access)(vcpu, gpte, - true); - gfn = gpte_to_gfn(gpte); - pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, - pte_access ACC_WRITE_MASK); - if (is_invalid_pfn(pfn)) + if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true)) break; - - mmu_set_spte(vcpu, spte, sp-role.access, pte_access, 0, 0, -NULL, PT_PAGE_TABLE_LEVEL, gfn, -pfn, true, true); - - if (likely(!is_noslot_pfn(pfn))) - kvm_release_pfn_clean(pfn); } } -- 1.7.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 6/7] KVM: MMU: move prefetch_invalid_gpte out of paging_tmpl.h
The function does not depend on guest mmu mode, move it out from paging_tmpl.h Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/mmu.c | 36 arch/x86/kvm/paging_tmpl.h | 24 ++-- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index bc1cda4..a455c0d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2503,6 +2503,14 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) mmu_free_roots(vcpu); } +static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) +{ + int bit7; + + bit7 = (gpte 7) 1; + return (gpte mmu-rsvd_bits_mask[bit7][level-1]) != 0; +} + static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { @@ -2515,6 +2523,26 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, return gfn_to_pfn_memslot_atomic(slot, gfn); } +static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *spte, + u64 gpte) +{ + if (is_rsvd_bits_set(vcpu-arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + goto no_present; + + if (!is_present_gpte(gpte)) + goto no_present; + + if (!(gpte PT_ACCESSED_MASK)) + goto no_present; + + return false; + +no_present: + drop_spte(vcpu-kvm, spte); + return true; +} + static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *start, u64 *end) @@ -3396,14 +3424,6 @@ static void paging_free(struct kvm_vcpu *vcpu) nonpaging_free(vcpu); } -static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) -{ - int bit7; - - bit7 = (gpte 7) 1; - return (gpte mmu-rsvd_bits_mask[bit7][level-1]) != 0; -} - static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, int *nr_present) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 5b1af72..298e5c2 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -336,26 +336,6 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker, addr, access); 
} -static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *spte, - pt_element_t gpte) -{ - if (is_rsvd_bits_set(vcpu-arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) - goto no_present; - - if (!is_present_gpte(gpte)) - goto no_present; - - if (!(gpte PT_ACCESSED_MASK)) - goto no_present; - - return false; - -no_present: - drop_spte(vcpu-kvm, spte); - return true; -} - static bool FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, pt_element_t gpte, bool no_dirty_log) @@ -364,7 +344,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, gfn_t gfn; pfn_t pfn; - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) + if (prefetch_invalid_gpte(vcpu, sp, spte, gpte)) return false; pgprintk(%s: gpte %llx spte %p\n, __func__, (u64)gpte, spte); @@ -778,7 +758,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) sizeof(pt_element_t))) return -EINVAL; - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, sp-spt[i], gpte)) { + if (prefetch_invalid_gpte(vcpu, sp, sp-spt[i], gpte)) { vcpu-kvm-tlbs_dirty++; continue; } -- 1.7.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 7/7] KVM: MMU: introduce page_fault_start/page_fault_end
Wrap the common operations into these two functions Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/mmu.c | 55 arch/x86/kvm/paging_tmpl.h | 12 - 2 files changed, 40 insertions(+), 27 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a455c0d..d75bf9a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2848,6 +2848,31 @@ exit: static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gva_t gva, pfn_t *pfn, bool write, bool *writable); +static bool +page_fault_start(struct kvm_vcpu *vcpu, gfn_t *gfnp, pfn_t *pfnp, int *levelp, +bool force_pt_level, unsigned long mmu_seq) + __acquires(vcpu-kvm-mmu_lock) +{ + spin_lock(vcpu-kvm-mmu_lock); + if (mmu_notifier_retry(vcpu, mmu_seq)) + return false; + + kvm_mmu_free_some_pages(vcpu); + if (likely(!force_pt_level)) + transparent_hugepage_adjust(vcpu, gfnp, pfnp, levelp); + + return true; +} + +static void page_fault_end(struct kvm_vcpu *vcpu, pfn_t pfn) + __releases(vcpu-kvm-mmu_lock) +{ + spin_unlock(vcpu-kvm-mmu_lock); + + if (likely(!is_noslot_pfn(pfn))) + kvm_release_pfn_clean(pfn); +} + static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, gfn_t gfn, bool prefault) { @@ -2885,22 +2910,17 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, r)) return r; - spin_lock(vcpu-kvm-mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) { + if (!page_fault_start(vcpu, gfn, pfn, level, force_pt_level, + mmu_seq)) { r = 0; - goto out_unlock; + goto exit; } - kvm_mmu_free_some_pages(vcpu); - if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, gfn, pfn, level); r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, prefault); -out_unlock: - spin_unlock(vcpu-kvm-mmu_lock); - if (likely(!is_noslot_pfn(pfn))) - kvm_release_pfn_clean(pfn); +exit: + page_fault_end(vcpu, pfn); return r; } @@ -3354,22 +3374,17 @@ static int tdp_page_fault(struct 
kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, r)) return r; - spin_lock(vcpu-kvm-mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) { + if (!page_fault_start(vcpu, gfn, pfn, level, force_pt_level, + mmu_seq)) { r = 0; - goto out_unlock; + goto exit; } - kvm_mmu_free_some_pages(vcpu); - if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, gfn, pfn, level); r = __direct_map(vcpu, gpa, write, map_writable, level, gfn, pfn, prefault); -out_unlock: - spin_unlock(vcpu-kvm-mmu_lock); - if (likely(!is_noslot_pfn(pfn))) - kvm_release_pfn_clean(pfn); +exit: + page_fault_end(vcpu, pfn); return r; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 298e5c2..269116d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -597,10 +597,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, walker.gfn, pfn, walker.pte_access, r)) return r; - spin_lock(vcpu-kvm-mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) { + if (!page_fault_start(vcpu, walker.gfn, pfn, level, + force_pt_level, mmu_seq)) { r = 0; - goto out_unlock; + goto exit; } kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); @@ -613,10 +613,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, ++vcpu-stat.pf_fixed; kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); -out_unlock: - spin_unlock(vcpu-kvm-mmu_lock); - if (likely(!is_noslot_pfn(pfn))) - kvm_release_pfn_clean(pfn); +exit: + page_fault_end(vcpu, pfn); return r; } -- 1.7.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 02/17] target-i386: Add missing kvm bits.
On Thu, 20 Sep 2012 16:03:17 -0400 Don Slutz d...@cloudswitch.com wrote: Fix duplicate name (kvmclock = kvm_clock2) also. Signed-off-by: Don Slutz d...@cloudswitch.com --- target-i386/cpu.c | 12 1 files changed, 8 insertions(+), 4 deletions(-) diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 0313cf5..5f9866a 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -87,10 +87,14 @@ static const char *ext3_feature_name[] = { }; static const char *kvm_feature_name[] = { -kvmclock, kvm_nopiodelay, kvm_mmu, kvmclock, kvm_asyncpf, NULL, kvm_pv_eoi, NULL, -NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, -NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, -NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +kvmclock, kvm_nopiodelay, kvm_mmu, kvm_clock2, before patch if kvmclock is specified it would set 0 and 3 bits, after patch only bit 0 is set. Is it correct/expected behavior? if yes, please add rationale into patch description. +kvm_asyncpf, kvm_steal_time, kvm_pv_eoi, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +kvm_clock_stable, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, }; static const char *svm_feature_name[] = { -- 1.7.1 -- Regards, Igor -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH v3] kvm/fpu: Enable fully eager restore kvm FPU
-Original Message- From: Avi Kivity [mailto:a...@redhat.com] Sent: Thursday, September 20, 2012 5:20 PM To: Hao, Xudong Cc: Marcelo Tosatti; kvm@vger.kernel.org; Zhang, Xiantao Subject: Re: [PATCH v3] kvm/fpu: Enable fully eager restore kvm FPU On guest entry: if (!lazy_fpu_allowed(vcpu)) kvm_x86_ops-fpu_activate(vcpu); But we already have that: if (vcpu-fpu_active) kvm_load_guest_fpu(vcpu); so why not manage fpu_active to be always set when needed? I don't want more checks in the entry path. I means add fpu_active() in kvm_set_xcr(), not in guest entry. Then the fpu_active will be set always when guest initialize xstate. @@ -574,6 +574,9 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) kvm_inject_gp(vcpu, 0); return 1; } + if (!lazy_fpu_allowed(vcpu)) + kvm_x86_ops-fpu_activate(vcpu); return 0; And of course on cr4 update. So a function update_lazy_fpu() to be called from both places is needed. Complete consideration, thanks. So I will define a function update_lazy_fpu(), insert it into kvm_set_xcr() and handle_cr(). Comments? -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/6] Reduce compaction scanning and lock contention
Hi Mel, Thank you for this series. I have applied on clean 3.6-rc5 and tested, and it works well for me - the lock contention is (still) gone and isolate_freepages_block is much reduced. Here is a typical test with these patches: # grep -F '[k]' report | head -8 65.20% qemu-kvm [kernel.kallsyms] [k] clear_page_c 2.18% qemu-kvm [kernel.kallsyms] [k] isolate_freepages_block 1.56% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock 1.40% qemu-kvm [kernel.kallsyms] [k] svm_vcpu_run 1.38% swapper [kernel.kallsyms] [k] default_idle 1.35% qemu-kvm [kernel.kallsyms] [k] get_page_from_freelist 0.74% ksmd [kernel.kallsyms] [k] memcmp 0.72% qemu-kvm [kernel.kallsyms] [k] free_pages_prepare I did manage to get a couple which were slightly worse, but nothing like as bad as before. Here are the results: # grep -F '[k]' report | head -8 45.60% qemu-kvm [kernel.kallsyms] [k] clear_page_c 11.26% qemu-kvm [kernel.kallsyms] [k] isolate_freepages_block 3.21% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock 2.27% ksmd [kernel.kallsyms] [k] memcmp 2.02%swapper [kernel.kallsyms] [k] default_idle 1.58% qemu-kvm [kernel.kallsyms] [k] svm_vcpu_run 1.30% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock_irqsave 1.09% qemu-kvm [kernel.kallsyms] [k] get_page_from_freelist # grep -F '[k]' report | head -8 61.29% qemu-kvm [kernel.kallsyms] [k] clear_page_c 4.52% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock_irqsave 2.64% qemu-kvm [kernel.kallsyms] [k] copy_page_c 1.61%swapper [kernel.kallsyms] [k] default_idle 1.57% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock 1.18% qemu-kvm [kernel.kallsyms] [k] get_page_from_freelist 1.18% qemu-kvm [kernel.kallsyms] [k] isolate_freepages_block 1.11% qemu-kvm [kernel.kallsyms] [k] svm_vcpu_run I will follow up with the detailed traces for these three tests. Thank you! Richard. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 1/2] KVM: PPC: Book3S: Get/set guest SPRs using the GET/SET_ONE_REG interface
On 21.09.2012, at 07:44, Paul Mackerras wrote: This enables userspace to get and set various SPRs (special-purpose registers) using the KVM_[GS]ET_ONE_REG ioctls. With this, userspace can get and set all the SPRs that are part of the guest state, either through the KVM_[GS]ET_REGS ioctls, the KVM_[GS]ET_SREGS ioctls, or the KVM_[GS]ET_ONE_REG ioctls. The SPRs that are added here are: - DABR: Data address breakpoint register - DSCR: Data stream control register - PURR: Processor utilization of resources register - SPURR: Scaled PURR - DAR: Data address register - DSISR: Data storage interrupt status register - AMR: Authority mask register - UAMOR: User authority mask override register - MMCR0, MMCR1, MMCRA: Performance monitor unit control registers - PMC1..PMC8: Performance monitor unit counter registers In order to reduce code duplication between PR and HV KVM code, this moves the kvm_vcpu_ioctl_[gs]et_one_reg functions into book3s.c and centralizes the copying between user and kernel space there. The registers that are handled differently between PR and HV, and those that exist only in one flavor, are handled in kvmppc_[gs]et_one_reg() functions that are specific to each flavor. 
Signed-off-by: Paul Mackerras pau...@samba.org --- v3: handle DAR and DSISR, plus copy to/from userspace, in common code Documentation/virtual/kvm/api.txt | 19 + arch/powerpc/include/asm/kvm.h | 21 ++ arch/powerpc/include/asm/kvm_ppc.h |7 arch/powerpc/kvm/book3s.c | 74 +++ arch/powerpc/kvm/book3s_hv.c | 76 ++-- arch/powerpc/kvm/book3s_pr.c | 23 ++- 6 files changed, 196 insertions(+), 24 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 76a07a6..e4a2067 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1740,6 +1740,25 @@ registers, find a list below: PPC | KVM_REG_PPC_IAC4 | 64 PPC | KVM_REG_PPC_DAC1 | 64 PPC | KVM_REG_PPC_DAC2 | 64 + PPC | KVM_REG_PPC_DABR | 64 + PPC | KVM_REG_PPC_DSCR | 64 + PPC | KVM_REG_PPC_PURR | 64 + PPC | KVM_REG_PPC_SPURR | 64 + PPC | KVM_REG_PPC_DAR | 64 + PPC | KVM_REG_PPC_DSISR | 32 + PPC | KVM_REG_PPC_AMR | 64 + PPC | KVM_REG_PPC_UAMOR | 64 + PPC | KVM_REG_PPC_MMCR0 | 64 + PPC | KVM_REG_PPC_MMCR1 | 64 + PPC | KVM_REG_PPC_MMCRA | 64 + PPC | KVM_REG_PPC_PMC1 | 32 + PPC | KVM_REG_PPC_PMC2 | 32 + PPC | KVM_REG_PPC_PMC3 | 32 + PPC | KVM_REG_PPC_PMC4 | 32 + PPC | KVM_REG_PPC_PMC5 | 32 + PPC | KVM_REG_PPC_PMC6 | 32 + PPC | KVM_REG_PPC_PMC7 | 32 + PPC | KVM_REG_PPC_PMC8 | 32 4.69 KVM_GET_ONE_REG diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index 3c14202..9557576 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -338,5 +338,26 @@ struct kvm_book3e_206_tlb_params { #define KVM_REG_PPC_IAC4 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x5) #define KVM_REG_PPC_DAC1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x6) #define KVM_REG_PPC_DAC2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x7) +#define KVM_REG_PPC_DABR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8) +#define KVM_REG_PPC_DSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x9) +#define KVM_REG_PPC_PURR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa) +#define KVM_REG_PPC_SPURR(KVM_REG_PPC | 
KVM_REG_SIZE_U64 | 0xb) +#define KVM_REG_PPC_DAR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc) +#define KVM_REG_PPC_DSISR(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xd) +#define KVM_REG_PPC_AMR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xe) +#define KVM_REG_PPC_UAMOR(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xf) + +#define KVM_REG_PPC_MMCR0(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x10) +#define KVM_REG_PPC_MMCR1(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x11) +#define KVM_REG_PPC_MMCRA(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x12) + +#define KVM_REG_PPC_PMC1 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x18) +#define KVM_REG_PPC_PMC2 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x19) +#define KVM_REG_PPC_PMC3 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1a) +#define KVM_REG_PPC_PMC4 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1b) +#define KVM_REG_PPC_PMC5 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1c) +#define KVM_REG_PPC_PMC6 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1d) +#define KVM_REG_PPC_PMC7 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1e) +#define KVM_REG_PPC_PMC8 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1f) #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2c94cb3..6002b0a 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -196,6 +196,11
[Autotest][PATCH 1/5] Autotest: Add utils for OpenVSwitch patch
ForAllxx: run object method on every object in list ForAll[a,b,c].print() Signed-off-by: Jiří Župka jzu...@redhat.com --- client/shared/base_utils.py | 81 +++--- 1 files changed, 67 insertions(+), 14 deletions(-) diff --git a/client/shared/base_utils.py b/client/shared/base_utils.py index 0734742..573b907 100644 --- a/client/shared/base_utils.py +++ b/client/shared/base_utils.py @@ -1224,6 +1224,56 @@ def system_output_parallel(commands, timeout=None, ignore_status=False, return out +class ForAll(list): +def __getattr__(self, name): +def wrapper(*args, **kargs): +return map(lambda o: o.__getattribute__(name)(*args, **kargs), self) +return wrapper + + +class ForAllP(list): + +Parallel version of ForAll + +def __getattr__(self, name): +def wrapper(*args, **kargs): +threads = [] +for o in self: +threads.append(InterruptedThread(o.__getattribute__(name), +args=args, kwargs=kargs)) +for t in threads: +t.start() +return map(lambda t: t.join(), threads) +return wrapper + + +class ForAllPSE(list): + +Parallel version of and suppress exception. 
+ +def __getattr__(self, name): +def wrapper(*args, **kargs): +threads = [] +for o in self: +threads.append(InterruptedThread(o.__getattribute__(name), +args=args, kwargs=kargs)) +for t in threads: +t.start() + +result = [] +for t in threads: +ret = {} +try: +ret[return] = t.join() +except Exception: +ret[exception] = sys.exc_info() +ret[args] = args +ret[kargs] = kargs +result.append(ret) +return result +return wrapper + + def etraceback(prep, exc_info): Enhanced Traceback formats traceback into lines prep: line\nname: line @@ -1733,9 +1783,12 @@ def import_site_function(path, module, funcname, dummy, modulefile=None): return import_site_symbol(path, module, funcname, dummy, modulefile) -def _get_pid_path(program_name): -pid_files_dir = GLOBAL_CONFIG.get_config_value(SERVER, 'pid_files_dir', - default=) +def get_pid_path(program_name, pid_files_dir=None): +if pid_files_dir is None: +pid_files_dir = GLOBAL_CONFIG.get_config_value(SERVER, + 'pid_files_dir', + default=) + if not pid_files_dir: base_dir = os.path.dirname(__file__) pid_path = os.path.abspath(os.path.join(base_dir, .., .., @@ -1746,25 +1799,25 @@ def _get_pid_path(program_name): return pid_path -def write_pid(program_name): +def write_pid(program_name, pid_files_dir=None): Try to drop program_name.pid in the main autotest directory. Args: program_name: prefix for file name -pidfile = open(_get_pid_path(program_name), w) +pidfile = open(get_pid_path(program_name, pid_files_dir), w) try: pidfile.write(%s\n % os.getpid()) finally: pidfile.close() -def delete_pid_file_if_exists(program_name): +def delete_pid_file_if_exists(program_name, pid_files_dir=None): Tries to remove program_name.pid from the main autotest directory. 
-pidfile_path = _get_pid_path(program_name) +pidfile_path = get_pid_path(program_name, pid_files_dir) try: os.remove(pidfile_path) @@ -1774,18 +1827,18 @@ def delete_pid_file_if_exists(program_name): raise -def get_pid_from_file(program_name): +def get_pid_from_file(program_name, pid_files_dir=None): Reads the pid from program_name.pid in the autotest directory. @param program_name the name of the program @return the pid if the file exists, None otherwise. -pidfile_path = _get_pid_path(program_name) +pidfile_path = get_pid_path(program_name, pid_files_dir) if not os.path.exists(pidfile_path): return None -pidfile = open(_get_pid_path(program_name), 'r') +pidfile = open(get_pid_path(program_name, pid_files_dir), 'r') try: try: @@ -1808,27 +1861,27 @@ def get_process_name(pid): return get_field(read_file(/proc/%d/stat % pid), 1)[1:-1] -def program_is_alive(program_name): +def program_is_alive(program_name, pid_files_dir=None): Checks if the process is alive and not in Zombie state. @param program_name the name of the program @return True if still alive, False otherwise -pid = get_pid_from_file(program_name) +pid = get_pid_from_file(program_name, pid_files_dir) if pid is None:
[Autotest][PATCH 2/5] virt: Adds OpenVSwitch support to virt tests.
When autotest tries add tap to bridge then test recognize if test is bridge is standard linux or OpenVSwitch. And adds some utils for bridge manipulation. Signed-off-by: Jiří Župka jzu...@redhat.com --- client/shared/openvswitch.py | 583 ++ client/tests/virt/virttest/utils_misc.py | 473 +++- 2 files changed, 1042 insertions(+), 14 deletions(-) create mode 100644 client/shared/openvswitch.py diff --git a/client/shared/openvswitch.py b/client/shared/openvswitch.py new file mode 100644 index 000..98dc5e9 --- /dev/null +++ b/client/shared/openvswitch.py @@ -0,0 +1,583 @@ +import logging, re, os, select, signal +try: +import autotest.common as common +except ImportError: +import common +from autotest.client import utils, package +from autotest.client.shared import error +from autotest.client.shared.base_utils import VersionableClass + + +class ServiceManagerInterface(VersionableClass): +def __new__(cls, *args, **kargs): +ServiceManagerInterface.master_class = ServiceManagerInterface +return super(ServiceManagerInterface, cls).__new__(cls, *args, **kargs) + +@classmethod +def get_version(cls): + +Get version of ServiceManager. +@return: Version of ServiceManager. 
+ +return open(/proc/1/comm, r).read().strip() + +def stop(self, service_name): +raise NotImplementedError(Method 'stop' must be + implemented in child class) + + +def start(self, service_name): +raise NotImplementedError(Method 'start' must be + implemented in child class) + + +def restart(self, service_name): +raise NotImplementedError(Method 'restart' must be + implemented in child class) + + +def status(self, service_name): +raise NotImplementedError(Method 'status' must be + implemented in child class) + + +class ServiceManagerSystemD(ServiceManagerInterface, VersionableClass): +@classmethod +def is_right_version(cls, version): +if version == systemd: +return True +return False + +def stop(self, service_name): +utils.run(systemctl stop %s.service % (service_name)) + + +def start(self, service_name): +utils.run(systemctl start %s.service % (service_name)) + + +def restart(self, service_name): +utils.run(systemctl restart %s.service % (service_name)) + + +def status(self, service_name): +utils.run(systemctl show %s.service % (service_name)) + + +class ServiceManagerSysvinit(ServiceManagerInterface, VersionableClass): +@classmethod +def is_right_version(cls, version): +if version == init: +return True +return False + + +def stop(self, service_name): +utils.run(/etc/init.d/%s stop % (service_name)) + + +def start(self, service_name): +utils.run(/etc/init.d/%s start % (service_name)) + + +def restart(self, service_name): +utils.run(/etc/init.d/%s restart % (service_name)) + + +class ServiceManager(ServiceManagerInterface): +pass + + +class PathComplete(object): +def __init__(self, bindir): +self.bindir = bindir + + +def __call__(self, path): +return os.path.join(self.bindir, path) + + +class OpenVSwitchControl(object): + +Class select the best matches control class for installed version +of OpenVSwitch. 
+ +OpenVSwtich parameters are described in man ovs-vswitchd.conf.db + +def __new__(cls, db_path=None, db_socket=None, db_pidfile=None, + ovs_pidfile=None, dbschema=None, install_prefix=None): + +Makes initialization of OpenVSwitch. + +@param tmpdir: Tmp directory for save openvswitch test files. +@param db_path: Path of OVS database. +@param db_socket: Path of OVS db socket. +@param db_pidfile: Path of OVS db ovsdb-server pid. +@param ovs_pidfile: Path of OVS ovs-vswitchd pid. +@param install_prefix: Path where is openvswitch installed. + +# if path is None set default path. +if not install_prefix: +install_prefix = / +if not db_path: +db_path = os.path.join(install_prefix, + /etc/openvswitch/conf.db) +if not db_socket: +db_socket = os.path.join(install_prefix, + /var/run/openvswitch/db.sock) +if not db_pidfile: +db_pidfile = os.path.join(install_prefix, + /var/run/openvswitch/ovsdb-server.pid) +if not ovs_pidfile: +ovs_pidfile = os.path.join(install_prefix, + /var/run/openvswitch/ovs-vswitchd.pid) +if not dbschema: +dbschema = os.path.join(install_prefix, +
[Autotest][PATCH 3/5] virt: Add functionality for VMs.
Allow creating of machine with tap devices which are not connected to bridge. Add function for fill virtnet object with address. Signed-off-by: Jiří Župka jzu...@redhat.com --- client/tests/virt/virttest/kvm_vm.py |9 + client/tests/virt/virttest/utils_misc.py |3 ++- client/tests/virt/virttest/virt_vm.py| 20 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/client/tests/virt/virttest/kvm_vm.py b/client/tests/virt/virttest/kvm_vm.py index 247f0eb..d3b4ede 100644 --- a/client/tests/virt/virttest/kvm_vm.py +++ b/client/tests/virt/virttest/kvm_vm.py @@ -949,7 +949,7 @@ class VM(virt_vm.BaseVM): qemu_cmd += add_name(hlp, name) # no automagic devices please defaults = params.get(defaults, no) -if has_option(hlp,nodefaults) and defaults != yes: +if has_option(hlp, nodefaults) and defaults != yes: qemu_cmd += -nodefaults # Add monitors for monitor_name in params.objects(monitors): @@ -1065,7 +1065,7 @@ class VM(virt_vm.BaseVM): for nic in vm.virtnet: # setup nic parameters as needed -nic = vm.add_nic(**dict(nic)) # add_netdev if netdev_id not set +nic = vm.add_nic(**dict(nic)) # add_netdev if netdev_id not set # gather set values or None if unset vlan = int(nic.get('vlan')) netdev_id = nic.get('netdev_id') @@ -2064,7 +2064,7 @@ class VM(virt_vm.BaseVM): nic.set_if_none('nettype', 'bridge') if nic.nettype == 'bridge': # implies tap # destination is required, hard-code reasonable default if unset -nic.set_if_none('netdst', 'virbr0') +# nic.set_if_none('netdst', 'virbr0') # tapfd allocated/set in activate because requires system resources nic.set_if_none('tapfd_id', utils_misc.generate_random_id()) elif nic.nettype == 'user': @@ -2142,7 +2142,8 @@ class VM(virt_vm.BaseVM): error.context(Raising bridge for + msg_sfx + attach_cmd, logging.debug) # assume this will puke if netdst unset -utils_misc.add_to_bridge(nic.ifname, nic.netdst) +if not nic.netdst is None: +utils_misc.add_to_bridge(nic.ifname, nic.netdst) elif nic.nettype == 'user': attach_cmd += user,name=%s % 
nic.ifname else: # unsupported nettype diff --git a/client/tests/virt/virttest/utils_misc.py b/client/tests/virt/virttest/utils_misc.py index e416551..b4799e7 100644 --- a/client/tests/virt/virttest/utils_misc.py +++ b/client/tests/virt/virttest/utils_misc.py @@ -719,7 +719,8 @@ class VirtIface(PropCan): Networking information for single guest interface and host connection. -__slots__ = ['nic_name', 'mac', 'nic_model', 'ip', 'nettype', 'netdst'] +__slots__ = ['nic_name', 'g_nic_name', 'mac', 'nic_model', 'ip', + 'nettype', 'netdst'] # Make sure first byte generated is always zero and it follows # the class definition. This helps provide more predictable # addressing while avoiding clashes between multiple NICs. diff --git a/client/tests/virt/virttest/virt_vm.py b/client/tests/virt/virttest/virt_vm.py index 79dd08b..ff6a0f7 100644 --- a/client/tests/virt/virttest/virt_vm.py +++ b/client/tests/virt/virttest/virt_vm.py @@ -518,6 +518,7 @@ class BaseVM(object): # Make sure the IP address is assigned to one or more macs # for this guest macs = self.virtnet.mac_list() + if not utils_misc.verify_ip_address_ownership(arp_ip, macs): raise VMAddressVerificationError(nic.mac, arp_ip) logging.debug('Found/Verified IP %s for VM %s NIC %s' % ( @@ -525,6 +526,25 @@ class BaseVM(object): return arp_ip +def fill_addrs(self, addrs): + +Fill VM's nic address to the virtnet structure based on VM's address +structure addrs. + +@param addrs: Dict of interfaces and address +{if_name:{mac:['addrs',], +ipv4:['addrs',], +ipv6:['addrs',]}, + ...} + +for virtnet in self.virtnet: +for iface_name, iface in addrs.iteritems(): +if virtnet.mac in iface[mac]: +virtnet.ip = {ipv4: iface[ipv4], + ipv6: iface[ipv6]} +virtnet.g_nic_name = iface_name + + def get_port(self, port, nic_index=0): Return the port in host space corresponding to port in guest space. 
-- 1.7.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[Autotest][PATCH 5/5] virt: Fix import paths.
Signed-off-by: Jiří Župka jzu...@redhat.com --- client/tests/virt/kvm/control.parallel|2 +- client/tests/virt/virttest/libvirt_xml.py |2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/client/tests/virt/kvm/control.parallel b/client/tests/virt/kvm/control.parallel index a33365b..70806c0 100644 --- a/client/tests/virt/kvm/control.parallel +++ b/client/tests/virt/kvm/control.parallel @@ -174,7 +174,7 @@ tests = list(parser.get_dicts()) # - # Run the tests # - -from autotest.client.virt import scheduler +from autotest.client.tests.virt.virttest import scheduler from autotest.client import utils # total_cpus defaults to the number of CPUs reported by /proc/cpuinfo diff --git a/client/tests/virt/virttest/libvirt_xml.py b/client/tests/virt/virttest/libvirt_xml.py index 79fe87a..992bfc2 100644 --- a/client/tests/virt/virttest/libvirt_xml.py +++ b/client/tests/virt/virttest/libvirt_xml.py @@ -11,7 +11,7 @@ import logging, os.path from autotest.client.shared import error, xml_utils -from autotest.client.virt import libvirt_vm, virsh +from virttest import libvirt_vm, virsh class LibvirtXMLError(Exception): -- 1.7.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/6] Reduce compaction scanning and lock contention
On Fri, Sep 21, 2012 at 10:13:33AM +0100, Richard Davies wrote: Hi Mel, Thank you for this series. I have applied on clean 3.6-rc5 and tested, and it works well for me - the lock contention is (still) gone and isolate_freepages_block is much reduced. Excellent! Here is a typical test with these patches: # grep -F '[k]' report | head -8 65.20% qemu-kvm [kernel.kallsyms] [k] clear_page_c 2.18% qemu-kvm [kernel.kallsyms] [k] isolate_freepages_block 1.56% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock 1.40% qemu-kvm [kernel.kallsyms] [k] svm_vcpu_run 1.38% swapper [kernel.kallsyms] [k] default_idle 1.35% qemu-kvm [kernel.kallsyms] [k] get_page_from_freelist 0.74% ksmd [kernel.kallsyms] [k] memcmp 0.72% qemu-kvm [kernel.kallsyms] [k] free_pages_prepare Ok, so that is more or less acceptable. I would like to reduce the scanning even further but I'll take this as a start -- largely because I do not have any new good ideas on how it could be reduced further without incurring a large cost in the page allocator :) I did manage to get a couple which were slightly worse, but nothing like as bad as before. 
Here are the results: # grep -F '[k]' report | head -8 45.60% qemu-kvm [kernel.kallsyms] [k] clear_page_c 11.26% qemu-kvm [kernel.kallsyms] [k] isolate_freepages_block 3.21% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock 2.27% ksmd [kernel.kallsyms] [k] memcmp 2.02%swapper [kernel.kallsyms] [k] default_idle 1.58% qemu-kvm [kernel.kallsyms] [k] svm_vcpu_run 1.30% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock_irqsave 1.09% qemu-kvm [kernel.kallsyms] [k] get_page_from_freelist # grep -F '[k]' report | head -8 61.29% qemu-kvm [kernel.kallsyms] [k] clear_page_c 4.52% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock_irqsave 2.64% qemu-kvm [kernel.kallsyms] [k] copy_page_c 1.61%swapper [kernel.kallsyms] [k] default_idle 1.57% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock 1.18% qemu-kvm [kernel.kallsyms] [k] get_page_from_freelist 1.18% qemu-kvm [kernel.kallsyms] [k] isolate_freepages_block 1.11% qemu-kvm [kernel.kallsyms] [k] svm_vcpu_run Were the boot times acceptable even when these slightly worse figures were recorded? I will follow up with the detailed traces for these three tests. Thank you! Thank you for the detailed reporting and the testing, it's much appreciated. I've already rebased the patches to Andrew's tree and tested them overnight and the figures look good on my side. I'll update the changelog and push them shortly. -- Mel Gorman SUSE Labs -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/6] Reduce compaction scanning and lock contention
Mel Gorman wrote: I did manage to get a couple which were slightly worse, but nothing like as bad as before. Here are the results: # grep -F '[k]' report | head -8 45.60% qemu-kvm [kernel.kallsyms] [k] clear_page_c 11.26% qemu-kvm [kernel.kallsyms] [k] isolate_freepages_block 3.21% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock 2.27% ksmd [kernel.kallsyms] [k] memcmp 2.02%swapper [kernel.kallsyms] [k] default_idle 1.58% qemu-kvm [kernel.kallsyms] [k] svm_vcpu_run 1.30% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock_irqsave 1.09% qemu-kvm [kernel.kallsyms] [k] get_page_from_freelist # grep -F '[k]' report | head -8 61.29% qemu-kvm [kernel.kallsyms] [k] clear_page_c 4.52% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock_irqsave 2.64% qemu-kvm [kernel.kallsyms] [k] copy_page_c 1.61%swapper [kernel.kallsyms] [k] default_idle 1.57% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock 1.18% qemu-kvm [kernel.kallsyms] [k] get_page_from_freelist 1.18% qemu-kvm [kernel.kallsyms] [k] isolate_freepages_block 1.11% qemu-kvm [kernel.kallsyms] [k] svm_vcpu_run Were the boot times acceptable even when these slightly worse figures were recorded? Yes, they were 10-20% slower as you might expect from the traces, rather than a factor slower. Thank you for the detailed reporting and the testing, it's much appreciated. I've already rebased the patches to Andrew's tree and tested them overnight and the figures look good on my side. I'll update the changelog and push them shortly. Great. On my side, I'm delighted that senior kernel developers such as you, Rik and Avi took our bug report seriously and helped fix it! Thank you, Richard. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 1/2] KVM: PPC: Book3S: Get/set guest SPRs using the GET/SET_ONE_REG interface
On Fri, Sep 21, 2012 at 11:15:51AM +0200, Alexander Graf wrote: On 21.09.2012, at 07:44, Paul Mackerras wrote: +union kvmppc_one_reg { + u32 wval; + u64 dval; Phew. Is this guaranteed to always pad on the right, rather than left? Absolutely (for big-endian targets). A union is basically a struct with all the members at offset 0. So we read N bytes into the union and then access the N-byte member. +static int one_reg_size(u64 id) +{ + id = (id KVM_REG_SIZE_MASK) KVM_REG_SIZE_SHIFT; + return 1 id; +} Rusty has a patch to export this as a generic function for all kvm targets. Check out http://permalink.gmane.org/gmane.comp.emulators.kvm.devel/97553 I actually like static inlines better, but we really should keep this code on top level :). Once Rusty's patch goes in I'd be happy to use it. + switch (reg-id) { + case KVM_REG_PPC_DAR: + val.dval = vcpu-arch.shared-dar; Hmm. I would like to keep the amount of places who need to manually know about the size of a register as low as possible. We do know the size of the register already on in the REG id and in sizeof(register), right? This code doesn't manually know about the size of the register. It could be that vcpu-arch.shared-dar is 4 bytes, or 2 bytes, instead of 8 bytes, and that wouldn't affect this code. All this code knows is that the definition of KVM_REG_PPC_DAR includes the 8 byte encoding. And since that definition is -- or will be -- part of the userspace ABI, it's not something that can be changed later, so I don't see any problem with just using val.dval here. I think there will be valid scenarios for them to be of different size. For example when a register suddenly increases in size. But the assignment should always happen on the size the register id tells us. It doesn't matter if a register changes in size, that doesn't affect this code. We may want to define an extra KVM_REG_PPC_* symbol if it increases in size, but that would be a different symbol to the ones we already have. 
As I said, we couldn't just arbitrarily change the existing symbol because that would break existing userspace programs. In other words the assignment already does happen on the size the register id tells us, with my code. So how about something like #define kvmppc_set_reg(id, val, reg) { \ switch (one_reg_size(id)) { \ case 4: val.wval = reg; break; \ case 8: val.dval = reg; break; \ default: BUG(); \ } \ } case KVM_REG_PPC_DAR: kvmppc_set_reg(reg-id, val, vcpu-arch.shared-dar); break; Maybe you can come up with something even easier though :). Seems unnecessarily complex to me. If we did something like that we should do kvmppc_set_reg(KVM_REG_PPC_DAR, val, vcpu-arch.shared-dar); and use a macro instead of one_reg_size() to give the compiler a better chance to do constant folding and only compile in the branch of the switch statement that we need. Btw, with such a scheme in place, we wouldn't even need the union. We could just reserve a char array and cast it in the setter/getter functions. I'm always wary of accessing a variable of one type as a different type using a cast on its address, because of the C99 type-punning rules. It's probably OK with a char array, but I think a union is cleaner, because you're explicitly stating that you want some storage that can be accessed as different types. Paul. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/6] Reduce compaction scanning and lock contention
On Fri, Sep 21, 2012 at 10:17:01AM +0100, Richard Davies wrote: Richard Davies wrote: I did manage to get a couple which were slightly worse, but nothing like as bad as before. Here are the results: # grep -F '[k]' report | head -8 45.60% qemu-kvm [kernel.kallsyms] [k] clear_page_c 11.26% qemu-kvm [kernel.kallsyms] [k] isolate_freepages_block 3.21% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock 2.27% ksmd [kernel.kallsyms] [k] memcmp 2.02%swapper [kernel.kallsyms] [k] default_idle 1.58% qemu-kvm [kernel.kallsyms] [k] svm_vcpu_run 1.30% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock_irqsave 1.09% qemu-kvm [kernel.kallsyms] [k] get_page_from_freelist # # captured on: Fri Sep 21 08:17:52 2012 # os release : 3.6.0-rc5-elastic+ # perf version : 3.5.2 # arch : x86_64 # nrcpus online : 16 # nrcpus avail : 16 # cpudesc : AMD Opteron(tm) Processor 6128 # cpuid : AuthenticAMD,16,9,1 # total memory : 131973276 kB # cmdline : /home/root/bin/perf record -g -a # event : name = cycles, type = 0, config = 0x0, config1 = 0x0, config2 = 0x0, excl_usr = 0, excl_kern = 0, id = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 } # HEADER_CPU_TOPOLOGY info available, use -I to display # HEADER_NUMA_TOPOLOGY info available, use -I to display # # # Samples: 283K of event 'cycles' # Event count (approx.): 109057976176 # # OverheadCommand Shared Object Symbol # . .. # 45.60% qemu-kvm [kernel.kallsyms] [k] clear_page_c | --- clear_page_c | |--93.35%-- do_huge_pmd_anonymous_page This is unavoidable. If THP was disabled, the cost would still be incurred, just on base pages instead of huge pages. SNIP 11.26% qemu-kvm [kernel.kallsyms] [k] isolate_freepages_block | --- isolate_freepages_block compaction_alloc migrate_pages compact_zone compact_zone_order try_to_compact_pages __alloc_pages_direct_compact __alloc_pages_nodemask alloc_pages_vma do_huge_pmd_anonymous_page And this is showing that we're still spending a lot of time scanning for free pages to isolate. 
I do not have a great idea on how this can be reduced further without interfering with the page allocator. One ok idea I considered in the past was using the buddy lists to find free pages quickly but there is first the problem that the buddy lists themselves may need to be searched and now that the zone lock is not held during the scan it would be particularly difficult. The harder problem is deciding when compaction finishes. I'll put more thought into it over the weekend and see if something falls out but I'm not going to hold up this series waiting for inspiration. 3.21% qemu-kvm [kernel.kallsyms] [k] _raw_spin_lock | --- _raw_spin_lock | |--39.96%-- tdp_page_fault Nothing very interesting here until... |--1.69%-- free_pcppages_bulk | | | |--77.53%-- drain_pages | | | | | |--95.77%-- drain_local_pages | | | | | | | |--97.90%-- generic_smp_call_function_interrupt | | | | smp_call_function_interrupt | | | | call_function_interrupt | | | | | | | | | |--23.37%-- kvm_vcpu_ioctl | | | | | do_vfs_ioctl | | | | | sys_ioctl | | | | | system_call_fastpath | | | | | ioctl | | | | | | | | | | | |--97.22%-- 0x1010006 | | |
Re: [PATCH v3 1/2] KVM: PPC: Book3S: Get/set guest SPRs using the GET/SET_ONE_REG interface
On 21.09.2012, at 11:52, Paul Mackerras wrote: On Fri, Sep 21, 2012 at 11:15:51AM +0200, Alexander Graf wrote: On 21.09.2012, at 07:44, Paul Mackerras wrote: +union kvmppc_one_reg { + u32 wval; + u64 dval; Phew. Is this guaranteed to always pad on the right, rather than left? Absolutely (for big-endian targets). A union is basically a struct with all the members at offset 0. So we read N bytes into the union and then access the N-byte member. +static int one_reg_size(u64 id) +{ + id = (id KVM_REG_SIZE_MASK) KVM_REG_SIZE_SHIFT; + return 1 id; +} Rusty has a patch to export this as a generic function for all kvm targets. Check out http://permalink.gmane.org/gmane.comp.emulators.kvm.devel/97553 I actually like static inlines better, but we really should keep this code on top level :). Once Rusty's patch goes in I'd be happy to use it. + switch (reg-id) { + case KVM_REG_PPC_DAR: + val.dval = vcpu-arch.shared-dar; Hmm. I would like to keep the amount of places who need to manually know about the size of a register as low as possible. We do know the size of the register already on in the REG id and in sizeof(register), right? This code doesn't manually know about the size of the register. It could be that vcpu-arch.shared-dar is 4 bytes, or 2 bytes, instead of 8 bytes, and that wouldn't affect this code. All this code knows is that the definition of KVM_REG_PPC_DAR includes the 8 byte encoding. And since that definition is -- or will be -- part of the userspace ABI, it's not something that can be changed later, so I don't see any problem with just using val.dval here. It's not really a problem, it's really about redundancy. We write the same thing twice case id that encodes length: val.length dependent value = foo; And every time we do constructs like that, someone will one day come in and do fun like case id with length 8: val.length 4 = foo; So if we can get rid of that complete bug category from day 1, it makes me happy :). 
I think there will be valid scenarios for them to be of different size. For example when a register suddenly increases in size. But the assignment should always happen on the size the register id tells us. It doesn't matter if a register changes in size, that doesn't affect this code. We may want to define an extra KVM_REG_PPC_* symbol if it increases in size, but that would be a different symbol to the ones we already have. As I said, we couldn't just arbitrarily change the existing symbol because that would break existing userspace programs. In other words the assignment already does happen on the size the register id tells us, with my code. Imagine we get new awesome 128bit GPRs now. The lower 64bit are still the same as they always were. The new machine is also backwards compatible, so we could be running 64bit code. To transfer the bigger registers we could then do case KVM_REG_PPC_GPR1: case KVM_REG_PPC_GPR1_128: kvmppc_set_reg(id, val, foo); But the bit I care a lot more about is the redundancy as explained above. So how about something like #define kvmppc_set_reg(id, val, reg) { \ switch (one_reg_size(id)) { \ case 4: val.wval = reg; break; \ case 8: val.dval = reg; break; \ default: BUG(); \ } \ } case KVM_REG_PPC_DAR: kvmppc_set_reg(reg-id, val, vcpu-arch.shared-dar); break; Maybe you can come up with something even easier though :). Seems unnecessarily complex to me. If we did something like that we should do kvmppc_set_reg(KVM_REG_PPC_DAR, val, vcpu-arch.shared-dar); and use a macro instead of one_reg_size() to give the compiler a better chance to do constant folding and only compile in the branch of the switch statement that we need. What variable type would the 3rd argument be? Also, constant folding should still work with one_reg_size(), no? Btw, with such a scheme in place, we wouldn't even need the union. We could just reserve a char array and cast it in the setter/getter functions. 
I'm always wary of accessing a variable of one type as a different type using a cast on its address, because of the C99 type-punning rules. It's probably OK with a char array, but I think a union is cleaner, because you're explicitly stating that you want some storage that can be accessed as different types. If unions are guaranteed to be laid out the way you describe above, that's perfectly fine for me. I just remember nightmares when I had to convert x86 incomplete unions (the 2 members were of different size) to ppc, where they started at the reverse end. A scheme that isn't tied to BE would maybe allow us to share code with ARM and x86. Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/9] Reduce compaction scanning and lock contention
Hi Andrew, Richard Davies and Shaohua Li have both reported lock contention problems in compaction on the zone and LRU locks as well as significant amounts of time being spent in compaction. This series aims to reduce lock contention and scanning rates to reduce that CPU usage. Richard reported at https://lkml.org/lkml/2012/9/21/91 that this series made a big different to a problem he reported in August (http://marc.info/?l=kvmm=134511507015614w=2). Patches 1-3 reverts existing patches in Andrew's tree that get replaced later in the series. Patch 4 is a fix for c67fe375 (mm: compaction: Abort async compaction if locks are contended or taking too long) to properly abort in all cases when contention is detected. Patch 5 defers acquiring the zone-lru_lock as long as possible. Patch 6 defers acquiring the zone-lock as lock as possible. Patch 7 reverts Rik's skip-free patches as the core concept gets reimplemented later and the remaining patches are easier to understand if this is reverted first. Patch 8 adds a pageblock-skip bit to the pageblock flags to cache what pageblocks should be skipped by the migrate and free scanners. This drastically reduces the amount of scanning compaction has to do. Patch 9 reimplements something similar to Rik's idea except it uses the pageblock-skip information to decide where the scanners should restart from and does not need to wrap around. I tested this on 3.6-rc6 + linux-next/akpm. Kernels tested were akpm-20120920 3.6-rc6 + linux-next/akpm as of Septeber 20th, 2012 lesslockPatches 1-6 revert Patches 1-7 cachefail Patches 1-8 skipuseless Patches 1-9 Stress high-order allocation tests looked ok. Success rates are more or less the same with the full series applied but there is an expectation that there is less opportunity to race with other allocation requests if there is less scanning. The time to complete the tests did not vary that much and are uninteresting as were the vmstat statistics so I will not present them here. 
Using ftrace I recorded how much scanning was done by compaction and got this 3.6.0-rc6 3.6.0-rc6 3.6.0-rc6 3.6.0-rc6 3.6.0-rc6 akpm-20120920 lockless revert-v2r2 cachefail skipuseless Total freescanned 360753976 515414028 565479007 17103281 18916589 Total freeisolated 285242935973694048601 670493 727840 Total freeefficiency0.0079%0.0070%0.0072%0.0392% 0.0385% Total migrate scanned 247728664 822729112 1004645830 17946827 14118903 Total migrate isolated 255532432459373437501 616359 658616 Total migrate efficiency0.0103%0.0039%0.0034%0.0343% 0.0466% The efficiency is worthless because of the nature of the test and the number of failures. The really interesting point as far as this patch series is concerned is the number of pages scanned. Note that reverting Rik's patches massively increases the number of pages scanned indicating that those patches really did make a difference to CPU usage. However, caching what pageblocks should be skipped has a much higher impact. With patches 1-8 applied, free page and migrate page scanning are both reduced by 95% in comparison to the akpm kernel. If the basic concept of Rik's patches are implemened on top then scanning then the free scanner barely changed but migrate scanning was further reduced. That said, tests on 3.6-rc5 indicated that the last patch had greater impact than what was measured here so it is a bit variable. One way or the other, this series has a large impact on the amount of scanning compaction does when there is a storm of THP allocations. include/linux/mmzone.h |5 +- include/linux/pageblock-flags.h | 19 +- mm/compaction.c | 397 +-- mm/internal.h | 11 +- mm/page_alloc.c |6 +- 5 files changed, 280 insertions(+), 158 deletions(-) -- 1.7.9.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/9] Revert mm: compaction: check lock contention first before taking lock
This reverts mm-compaction-check-lock-contention-first-before-taking-lock.patch as it is replaced by a later patch in the series. Signed-off-by: Mel Gorman mgor...@suse.de --- mm/compaction.c |5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 3bb7232..4a77b4b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -347,9 +347,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, /* Time to isolate some pages for migration */ cond_resched(); - locked = compact_trylock_irqsave(zone-lru_lock, flags, cc); - if (!locked) - return 0; + spin_lock_irqsave(zone-lru_lock, flags); + locked = true; for (; low_pfn end_pfn; low_pfn++) { struct page *page; -- 1.7.9.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/9] Revert mm: compaction: abort compaction loop if lock is contended or run too long
This reverts mm-compaction-abort-compaction-loop-if-lock-is-contended-or-run-too-long.patch as it is replaced by a later patch in the series. Signed-off-by: Mel Gorman mgor...@suse.de --- mm/compaction.c | 12 +--- mm/internal.h |2 +- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 1c873bb..614f18b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -70,7 +70,8 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, /* async aborts if taking too long or contended */ if (!cc-sync) { - cc-contended = true; + if (cc-contended) + *cc-contended = true; return false; } @@ -685,7 +686,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, /* Perform the isolation */ low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn); - if (!low_pfn || cc-contended) + if (!low_pfn) return ISOLATE_ABORT; cc-migrate_pfn = low_pfn; @@ -893,7 +894,6 @@ static unsigned long compact_zone_order(struct zone *zone, bool sync, bool *contended, struct page **page) { - unsigned long ret; struct compact_control cc = { .nr_freepages = 0, .nr_migratepages = 0, @@ -901,15 +901,13 @@ static unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, + .contended = contended, .page = page, }; INIT_LIST_HEAD(cc.freepages); INIT_LIST_HEAD(cc.migratepages); - ret = compact_zone(zone, cc); - if (contended) - *contended = cc.contended; - return ret; + return compact_zone(zone, cc); } int sysctl_extfrag_threshold = 500; diff --git a/mm/internal.h b/mm/internal.h index eebbed5..386772f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -131,7 +131,7 @@ struct compact_control { int order; /* order a direct compactor needs */ int migratetype;/* MOVABLE, RECLAIMABLE etc */ struct zone *zone; - bool contended; /* True if a lock was contended */ + bool *contended;/* True if a lock was contended */ struct page **page; /* Page captured of 
requested size */ }; -- 1.7.9.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/9] mm: compaction: Acquire the zone->lru_lock as late as possible
Compactions migrate scanner acquires the zone-lru_lock when scanning a range of pages looking for LRU pages to acquire. It does this even if there are no LRU pages in the range. If multiple processes are compacting then this can cause severe locking contention. To make matters worse commit b2eef8c0 (mm: compaction: minimise the time IRQs are disabled while isolating pages for migration) releases the lru_lock every SWAP_CLUSTER_MAX pages that are scanned. This patch makes two changes to how the migrate scanner acquires the LRU lock. First, it only releases the LRU lock every SWAP_CLUSTER_MAX pages if the lock is contended. This reduces the number of times it unnecessarily disables and re-enables IRQs. The second is that it defers acquiring the LRU lock for as long as possible. If there are no LRU pages or the only LRU pages are transhuge then the LRU lock will not be acquired at all which reduces contention on zone-lru_lock. Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rik van Riel r...@redhat.com --- mm/compaction.c | 63 +-- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 6b55491..a6068ff 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -50,6 +50,11 @@ static inline bool migrate_async_suitable(int migratetype) return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; } +static inline bool should_release_lock(spinlock_t *lock) +{ + return need_resched() || spin_is_contended(lock); +} + /* * Compaction requires the taking of some coarse locks that are potentially * very heavily contended. 
Check if the process needs to be scheduled or @@ -62,7 +67,7 @@ static inline bool migrate_async_suitable(int migratetype) static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, bool locked, struct compact_control *cc) { - if (need_resched() || spin_is_contended(lock)) { + if (should_release_lock(lock)) { if (locked) { spin_unlock_irqrestore(lock, *flags); locked = false; @@ -327,7 +332,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, isolate_mode_t mode = 0; struct lruvec *lruvec; unsigned long flags; - bool locked; + bool locked = false; /* * Ensure that there are not too many pages isolated from the LRU @@ -347,23 +352,17 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, /* Time to isolate some pages for migration */ cond_resched(); - spin_lock_irqsave(zone-lru_lock, flags); - locked = true; for (; low_pfn end_pfn; low_pfn++) { struct page *page; /* give a chance to irqs before checking need_resched() */ - if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { - spin_unlock_irqrestore(zone-lru_lock, flags); - locked = false; + if (locked !((low_pfn+1) % SWAP_CLUSTER_MAX)) { + if (should_release_lock(zone-lru_lock)) { + spin_unlock_irqrestore(zone-lru_lock, flags); + locked = false; + } } - /* Check if it is ok to still hold the lock */ - locked = compact_checklock_irqsave(zone-lru_lock, flags, - locked, cc); - if (!locked || fatal_signal_pending(current)) - break; - /* * migrate_pfn does not necessarily start aligned to a * pageblock. 
Ensure that pfn_valid is called when moving @@ -403,21 +402,38 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, pageblock_nr = low_pfn pageblock_order; if (!cc-sync last_pageblock_nr != pageblock_nr !migrate_async_suitable(get_pageblock_migratetype(page))) { - low_pfn += pageblock_nr_pages; - low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; - last_pageblock_nr = pageblock_nr; - continue; + goto next_pageblock; } + /* Check may be lockless but that's ok as we recheck later */ if (!PageLRU(page)) continue; /* -* PageLRU is set, and lru_lock excludes isolation, -* splitting and collapsing (collapsing has already -* happened if PageLRU is set). +* PageLRU is set. lru_lock normally excludes isolation +* splitting and collapsing (collapsing has already happened +* if PageLRU is set) but the lock
[PATCH 4/9] mm: compaction: Abort compaction loop if lock is contended or run too long
From: Shaohua Li s...@fusionio.com Changelog since V2 o Fix BUG_ON triggered due to pages left on cc.migratepages o Make compact_zone_order() require non-NULL arg `contended' Changelog since V1 o only abort the compaction if lock is contended or run too long o Rearranged the code by Andrea Arcangeli. isolate_migratepages_range() might isolate no pages if for example when zone-lru_lock is contended and running asynchronous compaction. In this case, we should abort compaction, otherwise, compact_zone will run a useless loop and make zone-lru_lock is even contended. [minc...@kernel.org: Putback pages isolated for migration if aborting] [a...@linux-foundation.org: compact_zone_order requires non-NULL arg contended] Signed-off-by: Andrea Arcangeli aarca...@redhat.com Signed-off-by: Shaohua Li s...@fusionio.com Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rik van Riel r...@redhat.com --- mm/compaction.c | 17 - mm/internal.h |2 +- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 614f18b..6b55491 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -70,8 +70,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, /* async aborts if taking too long or contended */ if (!cc-sync) { - if (cc-contended) - *cc-contended = true; + cc-contended = true; return false; } @@ -686,7 +685,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, /* Perform the isolation */ low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn); - if (!low_pfn) + if (!low_pfn || cc-contended) return ISOLATE_ABORT; cc-migrate_pfn = low_pfn; @@ -846,6 +845,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) switch (isolate_migratepages(zone, cc)) { case ISOLATE_ABORT: ret = COMPACT_PARTIAL; + putback_lru_pages(cc-migratepages); + cc-nr_migratepages = 0; goto out; case ISOLATE_NONE: continue; @@ -894,6 +895,7 @@ static unsigned long compact_zone_order(struct zone *zone, bool 
sync, bool *contended, struct page **page) { + unsigned long ret; struct compact_control cc = { .nr_freepages = 0, .nr_migratepages = 0, @@ -901,13 +903,18 @@ static unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, - .contended = contended, .page = page, }; INIT_LIST_HEAD(cc.freepages); INIT_LIST_HEAD(cc.migratepages); - return compact_zone(zone, cc); + ret = compact_zone(zone, cc); + + VM_BUG_ON(!list_empty(cc.freepages)); + VM_BUG_ON(!list_empty(cc.migratepages)); + + *contended = cc.contended; + return ret; } int sysctl_extfrag_threshold = 500; diff --git a/mm/internal.h b/mm/internal.h index 386772f..eebbed5 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -131,7 +131,7 @@ struct compact_control { int order; /* order a direct compactor needs */ int migratetype;/* MOVABLE, RECLAIMABLE etc */ struct zone *zone; - bool *contended;/* True if a lock was contended */ + bool contended; /* True if a lock was contended */ struct page **page; /* Page captured of requested size */ }; -- 1.7.9.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 7/9] Revert mm: have order > 0 compaction start off where it left
This reverts commit 7db8889a (mm: have order 0 compaction start off where it left) and commit de74f1cc (mm: have order 0 compaction start near a pageblock with free pages). These patches were a good idea and tests confirmed that they massively reduced the amount of scanning but the implementation is complex and tricky to understand. A later patch will cache what pageblocks should be skipped and reimplements the concept of compact_cached_free_pfn on top for both migration and free scanners. Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rik van Riel r...@redhat.com --- include/linux/mmzone.h |4 --- mm/compaction.c| 65 mm/internal.h |6 - mm/page_alloc.c|5 4 files changed, 5 insertions(+), 75 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2daa54f..603d0b5 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -368,10 +368,6 @@ struct zone { */ spinlock_t lock; int all_unreclaimable; /* All pages pinned */ -#if defined CONFIG_COMPACTION || defined CONFIG_CMA - /* pfn where the last incremental compaction isolated free pages */ - unsigned long compact_cached_free_pfn; -#endif #ifdef CONFIG_MEMORY_HOTPLUG /* see spanned/present_pages for more description */ seqlock_t span_seqlock; diff --git a/mm/compaction.c b/mm/compaction.c index 8e56594..9fc1b61 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -539,20 +539,6 @@ next_pageblock: #endif /* CONFIG_COMPACTION || CONFIG_CMA */ #ifdef CONFIG_COMPACTION /* - * Returns the start pfn of the last page block in a zone. This is the starting - * point for full compaction of a zone. Compaction searches for free pages from - * the end of each zone, while isolate_freepages_block scans forward inside each - * page block. 
- */ -static unsigned long start_free_pfn(struct zone *zone) -{ - unsigned long free_pfn; - free_pfn = zone-zone_start_pfn + zone-spanned_pages; - free_pfn = ~(pageblock_nr_pages-1); - return free_pfn; -} - -/* * Based on information in the current compact_control, find blocks * suitable for isolating free pages from and then isolate them. */ @@ -620,19 +606,8 @@ static void isolate_freepages(struct zone *zone, * looking for free pages, the search will restart here as * page migration may have returned some pages to the allocator */ - if (isolated) { + if (isolated) high_pfn = max(high_pfn, pfn); - - /* -* If the free scanner has wrapped, update -* compact_cached_free_pfn to point to the highest -* pageblock with free pages. This reduces excessive -* scanning of full pageblocks near the end of the -* zone -*/ - if (cc-order 0 cc-wrapped) - zone-compact_cached_free_pfn = high_pfn; - } } /* split_free_page does not map the pages */ @@ -640,11 +615,6 @@ static void isolate_freepages(struct zone *zone, cc-free_pfn = high_pfn; cc-nr_freepages = nr_freepages; - - /* If compact_cached_free_pfn is reset then set it now */ - if (cc-order 0 !cc-wrapped - zone-compact_cached_free_pfn == start_free_pfn(zone)) - zone-compact_cached_free_pfn = high_pfn; } /* @@ -739,26 +709,8 @@ static int compact_finished(struct zone *zone, if (fatal_signal_pending(current)) return COMPACT_PARTIAL; - /* -* A full (order == -1) compaction run starts at the beginning and -* end of a zone; it completes when the migrate and free scanner meet. -* A partial (order 0) compaction can start with the free scanner -* at a random point in the zone, and may have to restart. -*/ - if (cc-free_pfn = cc-migrate_pfn) { - if (cc-order 0 !cc-wrapped) { - /* We started partway through; restart at the end. 
*/ - unsigned long free_pfn = start_free_pfn(zone); - zone-compact_cached_free_pfn = free_pfn; - cc-free_pfn = free_pfn; - cc-wrapped = 1; - return COMPACT_CONTINUE; - } - return COMPACT_COMPLETE; - } - - /* We wrapped around and ended up where we started. */ - if (cc-wrapped cc-free_pfn = cc-start_free_pfn) + /* Compaction run completes if the migrate and free scanner meet */ + if (cc-free_pfn = cc-migrate_pfn) return COMPACT_COMPLETE; /* @@ -864,15 +816,8 @@ static int
[PATCH 6/9] mm: compaction: Acquire the zone->lock as late as possible
Compactions free scanner acquires the zone-lock when checking for PageBuddy pages and isolating them. It does this even if there are no PageBuddy pages in the range. This patch defers acquiring the zone lock for as long as possible. In the event there are no free pages in the pageblock then the lock will not be acquired at all which reduces contention on zone-lock. Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rik van Riel r...@redhat.com --- mm/compaction.c | 141 ++- 1 file changed, 78 insertions(+), 63 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index a6068ff..8e56594 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -93,6 +93,28 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock, return compact_checklock_irqsave(lock, flags, false, cc); } +/* Returns true if the page is within a block suitable for migration to */ +static bool suitable_migration_target(struct page *page) +{ + + int migratetype = get_pageblock_migratetype(page); + + /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ + if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) + return false; + + /* If the page is a large free page, then allow migration */ + if (PageBuddy(page) page_order(page) = pageblock_order) + return true; + + /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ + if (migrate_async_suitable(migratetype)) + return true; + + /* Otherwise skip the block */ + return false; +} + static void compact_capture_page(struct compact_control *cc) { unsigned long flags; @@ -153,13 +175,16 @@ static void compact_capture_page(struct compact_control *cc) * pages inside of the pageblock (even though it may still end up isolating * some pages). 
*/ -static unsigned long isolate_freepages_block(unsigned long blockpfn, +static unsigned long isolate_freepages_block(struct compact_control *cc, + unsigned long blockpfn, unsigned long end_pfn, struct list_head *freelist, bool strict) { int nr_scanned = 0, total_isolated = 0; struct page *cursor; + unsigned long flags; + bool locked = false; cursor = pfn_to_page(blockpfn); @@ -168,23 +193,38 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn, int isolated, i; struct page *page = cursor; - if (!pfn_valid_within(blockpfn)) { - if (strict) - return 0; - continue; - } + if (!pfn_valid_within(blockpfn)) + goto strict_check; nr_scanned++; - if (!PageBuddy(page)) { - if (strict) - return 0; - continue; - } + if (!PageBuddy(page)) + goto strict_check; + + /* +* The zone lock must be held to isolate freepages. This +* unfortunately this is a very coarse lock and can be +* heavily contended if there are parallel allocations +* or parallel compactions. For async compaction do not +* spin on the lock and we acquire the lock as late as +* possible. +*/ + locked = compact_checklock_irqsave(cc-zone-lock, flags, + locked, cc); + if (!locked) + break; + + /* Recheck this is a suitable migration target under lock */ + if (!strict !suitable_migration_target(page)) + break; + + /* Recheck this is a buddy page under lock */ + if (!PageBuddy(page)) + goto strict_check; /* Found a free page, break it into order-0 pages */ isolated = split_free_page(page); if (!isolated strict) - return 0; + goto strict_check; total_isolated += isolated; for (i = 0; i isolated; i++) { list_add(page-lru, freelist); @@ -196,9 +236,23 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn, blockpfn += isolated - 1; cursor += isolated - 1; } + + continue; + +strict_check: + /* Abort isolation if the caller requested strict isolation */ + if (strict) { + total_isolated = 0; + goto out; + } }
[PATCH 9/9] mm: compaction: Restart compaction from near where it left off
This is almost entirely based on Rik's previous patches and discussions with him about how this might be implemented. Order 0 compaction stops when enough free pages of the correct page order have been coalesced. When doing subsequent higher order allocations, it is possible for compaction to be invoked many times. However, the compaction code always starts out looking for things to compact at the start of the zone, and for free pages to compact things to at the end of the zone. This can cause quadratic behaviour, with isolate_freepages starting at the end of the zone each time, even though previous invocations of the compaction code already filled up all free memory on that end of the zone. This can cause isolate_freepages to take enormous amounts of CPU with certain workloads on larger memory systems. This patch caches where the migration and free scanner should start from on subsequent compaction invocations using the pageblock-skip information. When compaction starts it begins from the cached restart points and will update the cached restart points until a page is isolated or a pageblock is skipped that would have been scanned by synchronous compaction. 
Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rik van Riel r...@redhat.com --- include/linux/mmzone.h |4 mm/compaction.c| 54 mm/internal.h |4 3 files changed, 53 insertions(+), 9 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a456361..e7792a3 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -370,6 +370,10 @@ struct zone { int all_unreclaimable; /* All pages pinned */ #if defined CONFIG_COMPACTION || defined CONFIG_CMA unsigned long compact_blockskip_expire; + + /* pfns where compaction scanners should start */ + unsigned long compact_cached_free_pfn; + unsigned long compact_cached_migrate_pfn; #endif #ifdef CONFIG_MEMORY_HOTPLUG /* see spanned/present_pages for more description */ diff --git a/mm/compaction.c b/mm/compaction.c index 9276bc8..4bd96f3 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -79,6 +79,9 @@ static void reset_isolation_suitable(struct zone *zone) */ if (time_before(jiffies, zone-compact_blockskip_expire)) return; + + zone-compact_cached_migrate_pfn = start_pfn; + zone-compact_cached_free_pfn = end_pfn; zone-compact_blockskip_expire = jiffies + (HZ * 5); /* Walk the zone and mark every pageblock as suitable for isolation */ @@ -99,13 +102,29 @@ static void reset_isolation_suitable(struct zone *zone) * If no pages were isolated then mark this pageblock to be skipped in the * future. The information is later cleared by reset_isolation_suitable(). 
*/ -static void update_pageblock_skip(struct page *page, unsigned long nr_isolated) +static void update_pageblock_skip(struct compact_control *cc, + struct page *page, unsigned long nr_isolated, + bool migrate_scanner) { + struct zone *zone = cc-zone; if (!page) return; - if (!nr_isolated) + if (!nr_isolated) { + unsigned long pfn = page_to_pfn(page); set_pageblock_skip(page); + + /* Update where compaction should restart */ + if (migrate_scanner) { + if (!cc-finished_update_migrate + pfn zone-compact_cached_migrate_pfn) + zone-compact_cached_migrate_pfn = pfn; + } else { + if (!cc-finished_update_free + pfn zone-compact_cached_free_pfn) + zone-compact_cached_free_pfn = pfn; + } + } } static inline bool should_release_lock(spinlock_t *lock) @@ -315,7 +334,7 @@ out: /* Update the pageblock-skip if the whole pageblock was scanned */ if (blockpfn == end_pfn) - update_pageblock_skip(valid_page, total_isolated); + update_pageblock_skip(cc, valid_page, total_isolated, false); return total_isolated; } @@ -530,6 +549,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, */ if (!cc-sync last_pageblock_nr != pageblock_nr !migrate_async_suitable(get_pageblock_migratetype(page))) { + cc-finished_update_migrate = true; goto next_pageblock; } @@ -578,6 +598,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, VM_BUG_ON(PageTransCompound(page)); /* Successfully isolated */ + cc-finished_update_migrate = true; del_page_from_lru_list(page, lruvec, page_lru(page));
[PATCH 8/9] mm: compaction: Cache if a pageblock was scanned and no pages were isolated
When compaction was implemented it was known that scanning could potentially be excessive. The ideal was that a counter be maintained for each pageblock but maintaining this information would incur a severe penalty due to a shared writable cache line. It has reached the point where the scanning costs are an serious problem, particularly on long-lived systems where a large process starts and allocates a large number of THPs at the same time. Instead of using a shared counter, this patch adds another bit to the pageblock flags called PG_migrate_skip. If a pageblock is scanned by either migrate or free scanner and 0 pages were isolated, the pageblock is marked to be skipped in the future. When scanning, this bit is checked before any scanning takes place and the block skipped if set. The main difficulty with a patch like this is when to ignore the cached information? If it's ignored too often, the scanning rates will still be excessive. If the information is too stale then allocations will fail that might have otherwise succeeded. In this patch o CMA always ignores the information o If the migrate and free scanner meet then the cached information will be discarded if it's at least 5 seconds since the last time the cache was discarded o If there are a large number of allocation failures, discard the cache. The time-based heuristic is very clumsy but there are few choices for a better event. Depending solely on multiple allocation failures still allows excessive scanning when THP allocations are failing in quick succession due to memory pressure. Waiting until memory pressure is relieved would cause compaction to continually fail instead of using reclaim/compaction to try allocate the page. The time-based mechanism is clumsy but a better option is not obvious. 
Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rik van Riel r...@redhat.com --- include/linux/mmzone.h |3 ++ include/linux/pageblock-flags.h | 19 +++- mm/compaction.c | 93 +-- mm/internal.h |1 + mm/page_alloc.c |1 + 5 files changed, 111 insertions(+), 6 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 603d0b5..a456361 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -368,6 +368,9 @@ struct zone { */ spinlock_t lock; int all_unreclaimable; /* All pages pinned */ +#if defined CONFIG_COMPACTION || defined CONFIG_CMA + unsigned long compact_blockskip_expire; +#endif #ifdef CONFIG_MEMORY_HOTPLUG /* see spanned/present_pages for more description */ seqlock_t span_seqlock; diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 19ef95d..eed27f4 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -30,6 +30,9 @@ enum pageblock_bits { PB_migrate, PB_migrate_end = PB_migrate + 3 - 1, /* 3 bits required for migrate types */ +#ifdef CONFIG_COMPACTION + PB_migrate_skip,/* If set the block is skipped by compaction */ +#endif /* CONFIG_COMPACTION */ NR_PAGEBLOCK_BITS }; @@ -65,10 +68,22 @@ unsigned long get_pageblock_flags_group(struct page *page, void set_pageblock_flags_group(struct page *page, unsigned long flags, int start_bitidx, int end_bitidx); +#ifdef CONFIG_COMPACTION +#define get_pageblock_skip(page) \ + get_pageblock_flags_group(page, PB_migrate_skip, \ + PB_migrate_skip + 1) +#define clear_pageblock_skip(page) \ + set_pageblock_flags_group(page, 0, PB_migrate_skip, \ + PB_migrate_skip + 1) +#define set_pageblock_skip(page) \ + set_pageblock_flags_group(page, 1, PB_migrate_skip, \ + PB_migrate_skip + 1) +#endif /* CONFIG_COMPACTION */ + #define get_pageblock_flags(page) \ - get_pageblock_flags_group(page, 0, NR_PAGEBLOCK_BITS-1) + get_pageblock_flags_group(page, 0, PB_migrate_end) #define set_pageblock_flags(page, flags) \ set_pageblock_flags_group(page, 
flags, \ - 0, NR_PAGEBLOCK_BITS-1) + 0, PB_migrate_end) #endif /* PAGEBLOCK_FLAGS_H */ diff --git a/mm/compaction.c b/mm/compaction.c index 9fc1b61..9276bc8 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -50,6 +50,64 @@ static inline bool migrate_async_suitable(int migratetype) return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; } +/* Returns true if the pageblock should be scanned for pages to isolate. */ +static inline bool
[PATCH 2/9] Revert mm-compaction-abort-compaction-loop-if-lock-is-contended-or-run-too-long-fix
This reverts mm-compaction-abort-compaction-loop-if-lock-is-contended-or-run-too-long-fix as it is replaced by a later patch in the series. Signed-off-by: Mel Gorman mgor...@suse.de --- mm/compaction.c |3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index 4a77b4b..1c873bb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -907,7 +907,8 @@ static unsigned long compact_zone_order(struct zone *zone, INIT_LIST_HEAD(cc.migratepages); ret = compact_zone(zone, cc); - *contended = cc.contended; + if (contended) + *contended = cc.contended; return ret; } -- 1.7.9.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3 00/19] ACPI memory hotplug
This is v3 of the ACPI memory hotplug functionality. Only x86_64 target is supported for now. Overview: Dimm device layout is modeled with a new qemu command line -dimm id=name,size=sz,node=pxm,populated=on|off The starting physical address for all dimms is calculated automatically from top of memory, skipping the pci hole at [PCI_HOLE_START, 4G). Node is defining numa proximity for this dimm. When not defined it defaults to zero. -dimm id=dimm0,size=512M,node=0,populated=off will define a 512M memory slot belonging to numa node 0. Dimms are added or removed with normal device_add, device_del operations: Hot-add syntax: device_add dimm,id=mydimm0 Hot-remove syntax: dimm_del dimm,id=mydimm0 Changes v2-v3 - qdev integration. Dimms are attached to a dimmbus. The dimmbus is a child of i440fx device in the pc machine. Hot-add and hot-remove are done with normal device_add / device_del operations on the dimmbus. New commands dimm_add and dimm_del are obsolete. (In previous versions, dimms were always present on the qdev tree, and dimm_add/del simply meant allocating or deallocating memory for the devices. This version actually does hot-operations on the qdev tree) - Add _PS3 method to allow OSPM-induced hot operations. - pci-window calculation in Seabios takes dimms into account(for both 32-bit and 64-bit windows) - rename new qmp commands: query-memory-total and query-memory-hotplug - balloon driver can see the hotplugged memory Changes v1-v2 - memory map is automatically calculated for hotplug dimms. Dimms are added from top-of-memory skipping the pci hole at [PCI_HOLE_START, 4G). - Renamed from -memslot to -dimm. Commands changed to dimm_add, dimm_del. - Seabios ejection array reduced to a byte. Use extraction macros for dimm ssdt. - additional SRAT paravirt info does not break previous SRAT fw_cfg layout. - Documentation of new acpi_piix4 registers and paravirt data. - add ACPI _OST support for _OST enabled guests. 
This allows qemu to receive notification for success / failure of memory hot-add and hot-remove operations. Guest needs to support _OST (https://lkml.org/lkml/2012/6/25/321) - add monitor info command to report total guest memory (initial + hot-added) - add command line options and monitor commands for batch dimm creation/population (obsolete from v3 onwards) Issues: - A main blocker issue is windows guest functionality. The patchset does not work for windows currently. My guess is the windows pnpmem driver does not like the seabios dimm device implementation (or the seabios dimm implementation is not fully ACPI-compliant). If someone can review the seabios patches or has any ideas to debug this, let me know. Testing on win2012 server RC or windows2008 consumer prerelease. When adding a DIMM, the device shows up in DeviceManager but does not work. Relevant messages: This device cannot start. (Code 10) Device configured(memory.inf) (UserPnP eventID 400) Device installed (memory.inf) ACPI/PNP0C80\2daba3ff1 was configured Device not started(PNPMEM) (Kernel-PnP eventID 411, kernelID) Device ACPI\PNP0C80\2daba3ff1 had a problem starting Driver Name: memory.inf (c:\Windows\system32\DRIVERS\pnpmem.sys 6.2.8400 winmain_win8rc)) Memory range:0x8000 - 0x9000 (Initial memory of VM is 2GB. The hotplugged DIMM was a 256GB with physical address range starting at 2GB ) Conflicting device list: No conflicts. Adding a 2nd or more dimms causes a crash (PNP_DETECTED_FATAL_ERROR with blue screen of death) and makes windows reboot. After this, the VM keeps rebooting with ACPI_BIOS_ERROR. The VM refuses to boot anymore once a 2nd (or more) extra dimm is plugged-in. - Is the dimmbus the correct way to go about integrating into qdev/qom? In a v1 comment, Anthony mentioned attaching dimms directly to an i440fx device as children. Is this possible without a bus? 
- Live migration works as long as the dimm layout (-dimm command line args) are identical at the source and destination qemu command line. Patch 10/19 creates the DimmDevice that corresponds to the unknown incoming ramblock. Ramblocks are migrated before qdev VMStates are migrated (the DimmDevice structure currently does not define a VMStateDescription). So the DimmDevice is handled diferrently than other devices. If this is not acceptable, any suggestions on how should it be reworked? - Hot-operation notification lists need to be added to migration state. Please review. Could people state which other issues they consider blocker for including this upstream? Does this patchset need to wait for 1.4 or could this be considered for 1.3 (assuming blockers are resolved)? The patchset has been revised every few months, but I will provide quicker version updates onwards. I can also bring this up on a weekly meeting agenda if needed. series is based on uq/master for qemu-kvm, and master for seabios. Can be found also at: http://github.com/vliaskov/qemu-kvm/commits/memhp-v3
[RFC PATCH v3 01/19][SeaBIOS] Add ACPI_EXTRACT_DEVICE* macros
This allows extracting the beginning, end, and name of a Device object.
[RFC PATCH v3 03/19][SeaBIOS] acpi-dsdt: Implement functions for memory hotplug
Extend the DSDT to include methods for handling memory hot-add and hot-remove notifications and memory device status requests. These functions are called from the memory device SSDT methods. Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- src/acpi-dsdt.dsl | 70 +++- 1 files changed, 68 insertions(+), 2 deletions(-) diff --git a/src/acpi-dsdt.dsl b/src/acpi-dsdt.dsl index 2060686..5d3e92b 100644 --- a/src/acpi-dsdt.dsl +++ b/src/acpi-dsdt.dsl @@ -737,6 +737,71 @@ DefinitionBlock ( } Return(One) } +/* Objects filled in by run-time generated SSDT */ +External(MTFY, MethodObj) +External(MEON, PkgObj) + +Method (CMST, 1, NotSerialized) { +// _STA method - return ON status of memdevice +// Local0 = MEON flag for this cpu +Store(DerefOf(Index(MEON, Arg0)), Local0) +If (Local0) { Return(0xF) } Else { Return(0x0) } +} + +/* Memory hotplug notify array */ +OperationRegion(MEST, SystemIO, 0xaf80, 32) +Field (MEST, ByteAcc, NoLock, Preserve) +{ +MES, 256 +} + +/* Memory eject byte */ +OperationRegion(MEMJ, SystemIO, 0xafa0, 1) +Field (MEMJ, ByteAcc, NoLock, Preserve) +{ +MPE, 8 +} + +Method(MESC, 0) { +// Local5 = active memdevice bitmap +Store (MES, Local5) +// Local2 = last read byte from bitmap +Store (Zero, Local2) +// Local0 = memory device iterator +Store (Zero, Local0) +While (LLess(Local0, SizeOf(MEON))) { +// Local1 = MEON flag for this memory device +Store(DerefOf(Index(MEON, Local0)), Local1) +If (And(Local0, 0x07)) { +// Shift down previously read bitmap byte +ShiftRight(Local2, 1, Local2) +} Else { +// Read next byte from memdevice bitmap +Store(DerefOf(Index(Local5, ShiftRight(Local0, 3))), Local2) +} +// Local3 = active state for this memory device +Store(And(Local2, 1), Local3) + +If (LNotEqual(Local1, Local3)) { +// State change - update MEON with new state +Store(Local3, Index(MEON, Local0)) +// Do MEM notify +If (LEqual(Local3, 1)) { +MTFY(Local0, 1) +} Else { +MTFY(Local0, 3) +} +} +Increment(Local0) +} +Return(One) +} + +Method 
(MPEJ, 2, NotSerialized) { +// _EJ0 method - eject callback +Store(Arg0, MPE) +Sleep(200) +} } @@ -759,8 +824,9 @@ DefinitionBlock ( // CPU hotplug event Return(\_SB.PRSC()) } -Method(_L03) { -Return(0x01) +Method(_E03) { +// Memory hotplug event +Return(\_SB.MESC()) } Method(_L04) { Return(0x01) -- 1.7.9 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3 04/19][SeaBIOS] acpi: generate hotplug memory devices
The memory device generation is guided by qemu paravirt info. Seabios first uses the info to setup SRAT entries for the hotplug-able memory slots. Afterwards, build_memssdt uses the created SRAT entries to generate appropriate memory device objects. One memory device (and corresponding SRAT entry) is generated for each hotplug-able qemu memslot. Currently no SSDT memory device is created for initial system memory. We only support up to 255 DIMMs for now (PackageOp used for the MEON array can only describe an array of at most 255 elements. VarPackageOp would be needed to support more than 255 devices) v1-v2: Seabios reads mems_sts from qemu to build e820_map SSDT size and some offsets are calculated with extraction macros. v2-v3: Minor name changes Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- src/acpi.c | 158 +-- 1 files changed, 152 insertions(+), 6 deletions(-) diff --git a/src/acpi.c b/src/acpi.c index 6d239fa..1223b52 100644 --- a/src/acpi.c +++ b/src/acpi.c @@ -13,6 +13,7 @@ #include pci_regs.h // PCI_INTERRUPT_LINE #include ioport.h // inl #include paravirt.h // qemu_cfg_irq0_override +#include memmap.h // /* ACPI tables init */ @@ -416,11 +417,26 @@ encodeLen(u8 *ssdt_ptr, int length, int bytes) #define PCIHP_AML (ssdp_pcihp_aml + *ssdt_pcihp_start) #define PCI_SLOTS 32 +/* 0x5B 0x82 DeviceOp PkgLength NameString DimmID */ +#define MEM_BASE 0xaf80 +#define MEM_AML (ssdm_mem_aml + *ssdt_mem_start) +#define MEM_SIZEOF (*ssdt_mem_end - *ssdt_mem_start) +#define MEM_OFFSET_HEX (*ssdt_mem_name - *ssdt_mem_start + 2) +#define MEM_OFFSET_ID (*ssdt_mem_id - *ssdt_mem_start) +#define MEM_OFFSET_PXM 31 +#define MEM_OFFSET_START 55 +#define MEM_OFFSET_END 63 +#define MEM_OFFSET_SIZE 79 + +u64 nb_hp_memslots = 0; +struct srat_memory_affinity *mem; + #define SSDT_SIGNATURE 0x54445353 // SSDT #define SSDT_HEADER_LENGTH 36 #include ssdt-susp.hex #include ssdt-pcihp.hex +#include ssdt-mem.hex #define PCI_RMV_BASE 0xae0c @@ -472,6 +488,111 @@ 
static void patch_pcihp(int slot, u8 *ssdt_ptr, u32 eject) } } +static void build_memdev(u8 *ssdt_ptr, int i, u64 mem_base, u64 mem_len, u8 node) +{ +memcpy(ssdt_ptr, MEM_AML, MEM_SIZEOF); +ssdt_ptr[MEM_OFFSET_HEX] = getHex(i 4); +ssdt_ptr[MEM_OFFSET_HEX+1] = getHex(i); +ssdt_ptr[MEM_OFFSET_ID] = i; +ssdt_ptr[MEM_OFFSET_PXM] = node; +*(u64*)(ssdt_ptr + MEM_OFFSET_START) = mem_base; +*(u64*)(ssdt_ptr + MEM_OFFSET_END) = mem_base + mem_len; +*(u64*)(ssdt_ptr + MEM_OFFSET_SIZE) = mem_len; +} + +static void* +build_memssdt(void) +{ +u64 mem_base; +u64 mem_len; +u8 node; +int i; +struct srat_memory_affinity *entry = mem; +u64 nb_memdevs = nb_hp_memslots; +u8 memslot_status, enabled; + +int length = ((1+3+4) + + (nb_memdevs * MEM_SIZEOF) + + (1+2+5+(12*nb_memdevs)) + + (6+2+1+(1*nb_memdevs))); +u8 *ssdt = malloc_high(sizeof(struct acpi_table_header) + length); +if (! ssdt) { +warn_noalloc(); +return NULL; +} +u8 *ssdt_ptr = ssdt + sizeof(struct acpi_table_header); + +// build Scope(_SB_) header +*(ssdt_ptr++) = 0x10; // ScopeOp +ssdt_ptr = encodeLen(ssdt_ptr, length-1, 3); +*(ssdt_ptr++) = '_'; +*(ssdt_ptr++) = 'S'; +*(ssdt_ptr++) = 'B'; +*(ssdt_ptr++) = '_'; + +for (i = 0; i nb_memdevs; i++) { +mem_base = (((u64)(entry-base_addr_high) 32 )| entry-base_addr_low); +mem_len = (((u64)(entry-length_high) 32 )| entry-length_low); +node = entry-proximity[0]; +build_memdev(ssdt_ptr, i, mem_base, mem_len, node); +ssdt_ptr += MEM_SIZEOF; +entry++; +} + +// build Method(MTFY, 2) {If (LEqual(Arg0, 0x00)) {Notify(CM00, Arg1)} ...} +*(ssdt_ptr++) = 0x14; // MethodOp +ssdt_ptr = encodeLen(ssdt_ptr, 2+5+(12*nb_memdevs), 2); +*(ssdt_ptr++) = 'M'; +*(ssdt_ptr++) = 'T'; +*(ssdt_ptr++) = 'F'; +*(ssdt_ptr++) = 'Y'; +*(ssdt_ptr++) = 0x02; +for (i=0; inb_memdevs; i++) { +*(ssdt_ptr++) = 0xA0; // IfOp + ssdt_ptr = encodeLen(ssdt_ptr, 11, 1); +*(ssdt_ptr++) = 0x93; // LEqualOp +*(ssdt_ptr++) = 0x68; // Arg0Op +*(ssdt_ptr++) = 0x0A; // BytePrefix +*(ssdt_ptr++) = i; +*(ssdt_ptr++) = 0x86; // 
NotifyOp +*(ssdt_ptr++) = 'M'; +*(ssdt_ptr++) = 'P'; +*(ssdt_ptr++) = getHex(i 4); +*(ssdt_ptr++) = getHex(i); +*(ssdt_ptr++) = 0x69; // Arg1Op +} + +// build Name(MEON, Package() { One, One, ..., Zero, Zero, ... }) +*(ssdt_ptr++) = 0x08; // NameOp +*(ssdt_ptr++) = 'M'; +*(ssdt_ptr++) = 'E'; +*(ssdt_ptr++) = 'O'; +*(ssdt_ptr++) = 'N'; +*(ssdt_ptr++) = 0x12; // PackageOp +ssdt_ptr = encodeLen(ssdt_ptr, 2+1+(1*nb_memdevs), 2); +*(ssdt_ptr++) =
[RFC PATCH v3 10/19] fix live-migration when populated=on is missing
Live migration works after memory hot-add events, as long as the qemu command line -dimm arguments are changed on the destination host to specify populated=on for the dimms that have been hot-added. If a command-line change has not occured, the destination host does not yet have the corresponding ramblock in its ram_list. Activate the dimm on the destination during ram_load. Perhaps several fields of the DimmDevice should be part of a VMStateDescription to handle migration in a cleaner way. But the problem is that ramblocks are checked before qdev vmstates. Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- arch_init.c | 24 +--- 1 files changed, 21 insertions(+), 3 deletions(-) diff --git a/arch_init.c b/arch_init.c index 5a1173e..b63caa7 100644 --- a/arch_init.c +++ b/arch_init.c @@ -45,6 +45,7 @@ #include hw/pcspk.h #include qemu/page_cache.h #include qmp-commands.h +#include hw/dimm.h #ifdef DEBUG_ARCH_INIT #define DPRINTF(fmt, ...) \ @@ -740,10 +741,27 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) } if (!block) { -fprintf(stderr, Unknown ramblock \%s\, cannot +/* this can happen if a dimm was hot-added at source host */ +bool ramblock_found = false; +if (dimm_add(id)) { +fprintf(stderr, Cannot add unknown ramblock \%s\, +cannot accept migration\n, id); +ret = -EINVAL; +goto done; +} +/* rescan ram_list, verify ramblock is there now */ +QLIST_FOREACH(block, ram_list.blocks, next) { +if (!strncmp(id, block-idstr, sizeof(id))) { +ramblock_found = true; +break; +} +} +if (!ramblock_found) { +fprintf(stderr, Unknown ramblock \%s\, cannot accept migration\n, id); -ret = -EINVAL; -goto done; +ret = -EINVAL; +goto done; +} } total_ram_bytes -= length; -- 1.7.9 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3 09/19] pc: Add dimm paravirt SRAT info
The numa_fw_cfg paravirt interface is extended to include SRAT information for all hotplug-able dimms. There are 3 words for each hotplug-able memory slot, denoting start address, size and node proximity. The new info is appended after existing numa info, so that the fw_cfg layout does not break. This information is used by Seabios to build hotplug memory device objects at runtime. nb_numa_nodes is set to 1 by default (not 0), so that we always pass srat info to SeaBIOS. v1-v2: Dimm SRAT info (#dimms) is appended at end of existing numa fw_cfg in order not to break existing layout Documentation of the new fwcfg layout is included in docs/specs/fwcfg.txt Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- docs/specs/fwcfg.txt | 28 hw/pc.c | 14 -- 2 files changed, 40 insertions(+), 2 deletions(-) create mode 100644 docs/specs/fwcfg.txt diff --git a/docs/specs/fwcfg.txt b/docs/specs/fwcfg.txt new file mode 100644 index 000..55f96d9 --- /dev/null +++ b/docs/specs/fwcfg.txt @@ -0,0 +1,28 @@ +QEMU-BIOS Paravirt Documentation +-- + +This document describes paravirt data structures passed from QEMU to BIOS. + +FW_CFG_NUMA paravirt info + +The SRAT info passed from QEMU to BIOS has the following layout: + +--- +#nodes | cpu0_pxm | cpu1_pxm | ... | cpulast_pxm | node0_mem | node1_mem | ... | nodelast_mem + +--- +#dimms | dimm0_start | dimm0_sz | dimm0_pxm | ... | dimmlast_start | dimmlast_sz | dimmlast_pxm + +Entry 0 contains the number of numa nodes (nb_numa_nodes). + +Entries 1..max_cpus: The next max_cpus entries describe node proximity for each +one of the vCPUs in the system. + +Entries max_cpus+1..max_cpus+nb_numa_nodes+1: The next nb_numa_nodes entries +describe the memory size for each one of the NUMA nodes in the system. 
+ +Entry max_cpus+nb_numa_nodes+1 contains the number of memory dimms (nb_hp_dimms) + +The last 3 * nb_hp_dimms entries are organized in triplets: Each triplet contains +the physical address offset, size (in bytes), and node proximity for the +respective dimm. diff --git a/hw/pc.c b/hw/pc.c index 2c9664d..f2604ae 100644 --- a/hw/pc.c +++ b/hw/pc.c @@ -598,6 +598,7 @@ static void *bochs_bios_init(void) uint8_t *smbios_table; size_t smbios_len; uint64_t *numa_fw_cfg; +uint64_t *hp_dimms_fw_cfg; int i, j; register_ioport_write(0x400, 1, 2, bochs_bios_write, NULL); @@ -632,8 +633,10 @@ static void *bochs_bios_init(void) /* allocate memory for the NUMA channel: one (64bit) word for the number * of nodes, one word for each VCPU-node and one word for each node to * hold the amount of memory. + * Finally one word for the number of hotplug memory slots and three words + * for each hotplug memory slot (start address, size and node proximity). */ -numa_fw_cfg = g_malloc0((1 + max_cpus + nb_numa_nodes) * 8); +numa_fw_cfg = g_malloc0((2 + max_cpus + nb_numa_nodes + 3 * nb_hp_dimms) * 8); numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes); for (i = 0; i max_cpus; i++) { for (j = 0; j nb_numa_nodes; j++) { @@ -646,8 +649,15 @@ static void *bochs_bios_init(void) for (i = 0; i nb_numa_nodes; i++) { numa_fw_cfg[max_cpus + 1 + i] = cpu_to_le64(node_mem[i]); } + +numa_fw_cfg[1 + max_cpus + nb_numa_nodes] = cpu_to_le64(nb_hp_dimms); + +hp_dimms_fw_cfg = numa_fw_cfg + 2 + max_cpus + nb_numa_nodes; +if (nb_hp_dimms) +setup_fwcfg_hp_dimms(hp_dimms_fw_cfg); + fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, (uint8_t *)numa_fw_cfg, - (1 + max_cpus + nb_numa_nodes) * 8); + (2 + max_cpus + nb_numa_nodes + 3 * nb_hp_dimms) * 8); return fw_cfg; } -- 1.7.9 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3 11/19] Implement qmp and hmp commands for notification lists
Guest can respond to ACPI hotplug events e.g. with _EJ or _OST method. This patch implements a tail queue to store guest notifications for memory hot-add and hot-remove requests. Guest responses for memory hotplug command on a per-dimm basis can be detected with the new hmp command info memhp or the new qmp command query-memhp Examples: (qemu) device_add dimm,id=ram0 (qemu) info memory-hotplug dimm: ram0 hot-add success or dimm: ram0 hot-add failure (qemu) device_del ram3 (qemu) info memory-hotplug dimm: ram3 hot-remove success or dimm: ram3 hot-remove failure Results are removed from the queue once read. This patch only queues _EJ events that signal hot-remove success. For _OST event queuing, which cover the hot-remove failure and hot-add success/failure cases, the _OST patches in this series are are also needed. These notification items should probably be part of migration state (not yet implemented). Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- hmp-commands.hx |2 + hmp.c| 17 ++ hmp.h|1 + hw/dimm.c| 62 +- hw/dimm.h|2 +- monitor.c|7 ++ qapi-schema.json | 26 ++ qmp-commands.hx | 37 8 files changed, 152 insertions(+), 2 deletions(-) diff --git a/hmp-commands.hx b/hmp-commands.hx index ed67e99..cfb1b67 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -1462,6 +1462,8 @@ show device tree show qdev device model list @item info roms show roms +@item info memory-hotplug +show memory-hotplug @end table ETEXI diff --git a/hmp.c b/hmp.c index ba6fbd3..4b3d63d 100644 --- a/hmp.c +++ b/hmp.c @@ -1168,3 +1168,20 @@ void hmp_screen_dump(Monitor *mon, const QDict *qdict) qmp_screendump(filename, err); hmp_handle_error(mon, err); } + +void hmp_info_memory_hotplug(Monitor *mon) +{ +MemHpInfoList *info; +MemHpInfoList *item; +MemHpInfo *dimm; + +info = qmp_query_memory_hotplug(NULL); +for (item = info; item; item = item-next) { +dimm = item-value; +monitor_printf(mon, dimm: %s %s %s\n, dimm-dimm, +dimm-request, dimm-result); +dimm-dimm = NULL; +} 
+ +qapi_free_MemHpInfoList(info); +} diff --git a/hmp.h b/hmp.h index 48b9c59..986705a 100644 --- a/hmp.h +++ b/hmp.h @@ -73,5 +73,6 @@ void hmp_getfd(Monitor *mon, const QDict *qdict); void hmp_closefd(Monitor *mon, const QDict *qdict); void hmp_send_key(Monitor *mon, const QDict *qdict); void hmp_screen_dump(Monitor *mon, const QDict *qdict); +void hmp_info_memory_hotplug(Monitor *mon); #endif diff --git a/hw/dimm.c b/hw/dimm.c index 288b997..fbd93a8 100644 --- a/hw/dimm.c +++ b/hw/dimm.c @@ -65,6 +65,7 @@ static void dimm_bus_initfn(Object *obj) DimmBus *bus = DIMM_BUS(obj); QTAILQ_INIT(bus-dimmconfig_list); QTAILQ_INIT(bus-dimmlist); +QTAILQ_INIT(bus-dimm_hp_result_queue); QTAILQ_FOREACH_SAFE(dimm_cfg, dimmconfig_list, nextdimmcfg, next_dimm_cfg) { QTAILQ_REMOVE(dimmconfig_list, dimm_cfg, nextdimmcfg); @@ -236,20 +237,78 @@ void dimm_notify(uint32_t idx, uint32_t event) { DimmBus *bus = main_memory_bus; DimmDevice *s; +DimmConfig *slotcfg; +struct dimm_hp_result *result; + s = dimm_find_from_idx(idx); assert(s != NULL); +result = g_malloc0(sizeof(*result)); +slotcfg = dimmcfg_find_from_name(DEVICE(s)-id); +result-dimmname = slotcfg-name; switch(event) { case DIMM_REMOVE_SUCCESS: dimm_depopulate(s); -qdev_simple_unplug_cb((DeviceState*)s); QTAILQ_REMOVE(bus-dimmlist, s, nextdimm); +qdev_simple_unplug_cb((DeviceState*)s); +QTAILQ_INSERT_TAIL(bus-dimm_hp_result_queue, result, next); break; default: +g_free(result); break; } } +MemHpInfoList *qmp_query_memory_hotplug(Error **errp) +{ +DimmBus *bus = main_memory_bus; +MemHpInfoList *head = NULL, *cur_item = NULL, *info; +struct dimm_hp_result *item, *nextitem; + +QTAILQ_FOREACH_SAFE(item, bus-dimm_hp_result_queue, next, nextitem) { + +info = g_malloc0(sizeof(*info)); +info-value = g_malloc0(sizeof(*info-value)); +info-value-dimm = g_malloc0(sizeof(char) * 32); +info-value-request = g_malloc0(sizeof(char) * 16); +info-value-result = g_malloc0(sizeof(char) * 16); +switch (item-ret) { +case DIMM_REMOVE_SUCCESS: 
+strcpy(info-value-request, hot-remove); +strcpy(info-value-result, success); +break; +case DIMM_REMOVE_FAIL: +strcpy(info-value-request, hot-remove); +strcpy(info-value-result, failure); +break; +case DIMM_ADD_SUCCESS: +strcpy(info-value-request, hot-add); +
[RFC PATCH v3 17/19][SeaBIOS] Implement _PS3 method for memory device
Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- src/acpi-dsdt.dsl | 15 +++ src/ssdt-mem.dsl |4 2 files changed, 19 insertions(+), 0 deletions(-) diff --git a/src/acpi-dsdt.dsl b/src/acpi-dsdt.dsl index 0d37bbc..8a18770 100644 --- a/src/acpi-dsdt.dsl +++ b/src/acpi-dsdt.dsl @@ -784,6 +784,13 @@ DefinitionBlock ( MIF, 8 } +/* Memory _PS3 byte */ +OperationRegion(MPSB, SystemIO, 0xafa4, 1) +Field (MPSB, ByteAcc, NoLock, Preserve) +{ +MPS, 8 +} + Method(MESC, 0) { // Local5 = active memdevice bitmap Store (MES, Local5) @@ -824,6 +831,14 @@ DefinitionBlock ( Store(Arg0, MPE) Sleep(200) } + +Method (MPS3, 1, NotSerialized) { +// _PS3 method - power-off method +Store(Arg0, MPS) +Store(Zero, Index(MEON, Arg0)) +Sleep(200) +} + Method (MOST, 3, Serialized) { // _OST method - OS status indication Switch (And(Arg0, 0xFF)) { diff --git a/src/ssdt-mem.dsl b/src/ssdt-mem.dsl index 041d301..7423fc6 100644 --- a/src/ssdt-mem.dsl +++ b/src/ssdt-mem.dsl @@ -39,6 +39,7 @@ DefinitionBlock (ssdt-mem.aml, SSDT, 0x02, BXPC, CSSDT, 0x1) External(CMST, MethodObj) External(MPEJ, MethodObj) External(MOST, MethodObj) +External(MPS3, MethodObj) Name(_CRS, ResourceTemplate() { QwordMemory( @@ -64,6 +65,9 @@ DefinitionBlock (ssdt-mem.aml, SSDT, 0x02, BXPC, CSSDT, 0x1) Method (_OST, 3) { MOST(Arg0, Arg1, ID) } +Method (_PS3, 0) { +MPS3(ID) +} } } -- 1.7.9 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3 18/19] Implement _PS3 for dimm
This will allow us to update dimm state on OSPM-initiated eject operations e.g. with "echo 1 > /sys/bus/acpi/devices/PNP0C80\:00/eject"
a/hw/dimm.c +++ b/hw/dimm.c @@ -319,6 +319,9 @@ void dimm_notify(uint32_t idx, uint32_t event) qdev_simple_unplug_cb((DeviceState*)s); QTAILQ_INSERT_TAIL(bus-dimm_hp_result_queue, result, next); break; +case DIMM_OSPM_POWEROFF: +if (bus-dimm_revert) +bus-dimm_revert(bus-dimm_hotplug_qdev, s, 1); default: g_free(result); break; diff --git a/hw/dimm.h b/hw/dimm.h index ce091fe..8d73b8f 100644 --- a/hw/dimm.h +++ b/hw/dimm.h @@ -15,7 +15,8 @@ typedef enum { DIMM_REMOVE_SUCCESS = 0, DIMM_REMOVE_FAIL = 1, DIMM_ADD_SUCCESS = 2, -DIMM_ADD_FAIL = 3 +DIMM_ADD_FAIL = 3, +DIMM_OSPM_POWEROFF = 4 } dimm_hp_result_code; typedef enum { -- 1.7.9 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3 08/19] pc: calculate dimm physical addresses and adjust memory map
Dimm physical address offsets are calculated automatically and memory map is adjusted accordingly. If a DIMM can fit before the PCI_HOLE_START (currently 0xe000), it will be added normally, otherwise its physical address will be above 4GB. Also create memory bus on i440fx-pcihost device. Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- hw/pc.c | 41 + hw/pc.h |6 ++ hw/pc_piix.c | 20 ++-- vl.c |1 + 4 files changed, 62 insertions(+), 6 deletions(-) diff --git a/hw/pc.c b/hw/pc.c index 112739a..2c9664d 100644 --- a/hw/pc.c +++ b/hw/pc.c @@ -52,6 +52,7 @@ #include arch_init.h #include bitmap.h #include vga-pci.h +#include dimm.h /* output Bochs bios info messages */ //#define DEBUG_BIOS @@ -93,6 +94,9 @@ struct e820_table { static struct e820_table e820_table; struct hpet_fw_config hpet_cfg = {.count = UINT8_MAX}; +ram_addr_t below_4g_hp_mem_size = 0; +ram_addr_t above_4g_hp_mem_size = 0; +extern target_phys_addr_t ram_hp_offset; void gsi_handler(void *opaque, int n, int level) { GSIState *s = opaque; @@ -1160,3 +1164,40 @@ void pc_pci_device_init(PCIBus *pci_bus) pci_create_simple(pci_bus, -1, lsi53c895a); } } + + +/* Function to configure memory offsets of hotpluggable dimms */ + +target_phys_addr_t pc_set_hp_memory_offset(uint64_t size) +{ +target_phys_addr_t ret; + +/* on first call, initialize ram_hp_offset */ +if (!ram_hp_offset) { +if (ram_size = PCI_HOLE_START ) { +ram_hp_offset = 0x1LL + (ram_size - PCI_HOLE_START); +} else { +ram_hp_offset = ram_size; +} +} + +if (ram_hp_offset = 0x1LL) { +ret = ram_hp_offset; +above_4g_hp_mem_size += size; +ram_hp_offset += size; +} +/* if dimm fits before pci hole, append it normally */ +else if (ram_hp_offset + size = PCI_HOLE_START) { +ret = ram_hp_offset; +below_4g_hp_mem_size += size; +ram_hp_offset += size; +} +/* otherwise place it above 4GB */ +else { +ret = 0x1LL; +above_4g_hp_mem_size += size; +ram_hp_offset = 0x1LL + size; +} + +return ret; +} diff --git a/hw/pc.h b/hw/pc.h index 
e4db071..f3304fc 100644 --- a/hw/pc.h +++ b/hw/pc.h @@ -10,6 +10,7 @@ #include memory.h #include ioapic.h +#define PCI_HOLE_START 0xe000 /* PC-style peripherals (also used by other machines). */ /* serial.c */ @@ -214,6 +215,11 @@ static inline bool isa_ne2000_init(ISABus *bus, int base, int irq, NICInfo *nd) /* pc_sysfw.c */ void pc_system_firmware_init(MemoryRegion *rom_memory); +/* memory hotplug */ +target_phys_addr_t pc_set_hp_memory_offset(uint64_t size); +extern ram_addr_t below_4g_hp_mem_size; +extern ram_addr_t above_4g_hp_mem_size; + /* e820 types */ #define E820_RAM1 #define E820_RESERVED 2 diff --git a/hw/pc_piix.c b/hw/pc_piix.c index 88ff041..d1fd276 100644 --- a/hw/pc_piix.c +++ b/hw/pc_piix.c @@ -43,6 +43,7 @@ #include xen.h #include memory.h #include exec-memory.h +#include dimm.h #ifdef CONFIG_XEN # include xen/hvm/hvm_info_table.h #endif @@ -155,9 +156,9 @@ static void pc_init1(MemoryRegion *system_memory, kvmclock_create(); } -if (ram_size = 0xe000 ) { -above_4g_mem_size = ram_size - 0xe000; -below_4g_mem_size = 0xe000; +if (ram_size = PCI_HOLE_START ) { +above_4g_mem_size = ram_size - PCI_HOLE_START; +below_4g_mem_size = PCI_HOLE_START; } else { above_4g_mem_size = 0; below_4g_mem_size = ram_size; @@ -172,6 +173,9 @@ static void pc_init1(MemoryRegion *system_memory, rom_memory = system_memory; } +/* adjust memory map for hotplug dimms */ +dimm_calc_offsets(pc_set_hp_memory_offset); + /* allocate ram and load rom/bios */ if (!xen_enabled()) { fw_cfg = pc_memory_init(system_memory, @@ -192,9 +196,11 @@ static void pc_init1(MemoryRegion *system_memory, if (pci_enabled) { pci_bus = i440fx_init(i440fx_state, piix3_devfn, isa_bus, gsi, system_memory, system_io, ram_size, - below_4g_mem_size, - 0x1ULL - below_4g_mem_size, - 0x1ULL + above_4g_mem_size, + below_4g_mem_size + below_4g_hp_mem_size, + 0x1ULL - below_4g_mem_size +- below_4g_hp_mem_size, + 0x1ULL + above_4g_mem_size ++ above_4g_hp_mem_size, (sizeof(target_phys_addr_t) == 4 ? 
0 : ((uint64_t)1 62)), @@ -223,6 +229,8 @@ static void pc_init1(MemoryRegion *system_memory,
[RFC PATCH v3 19/19][SeaBIOS] Calculate pcimem_start and pcimem64_start from SRAT entries
pcimem_start and pcimem64_start are adjusted from srat entries. For this reason, paravirt info (NUMA SRAT entries and number of cpus) need to be read before pci_setup. Imho, this is an ugly code change since SRAT bios tables and number of cpus have to be read earlier. But the advantage is that no new paravirt interface is introduced. Suggestions to make the code change cleaner are welcome. The alternative patch (will be sent as a reply to this patch) implements a paravirt interface to read the starting values of pcimem_start and pcimem64_start from QEMU. Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- src/acpi.c| 82 src/acpi.h|3 ++ src/pciinit.c |6 +++- src/post.c|3 ++ src/smp.c |4 +++ 5 files changed, 72 insertions(+), 26 deletions(-) diff --git a/src/acpi.c b/src/acpi.c index 1223b52..9e99aa7 100644 --- a/src/acpi.c +++ b/src/acpi.c @@ -428,7 +428,10 @@ encodeLen(u8 *ssdt_ptr, int length, int bytes) #define MEM_OFFSET_END 63 #define MEM_OFFSET_SIZE 79 -u64 nb_hp_memslots = 0; +u64 nb_hp_memslots = 0, nb_numanodes; +u64 *numa_data, *hp_memdata; +u64 below_4g_hp_mem_size = 0; +u64 above_4g_hp_mem_size = 0; struct srat_memory_affinity *mem; #define SSDT_SIGNATURE 0x54445353 // SSDT @@ -763,17 +766,7 @@ acpi_build_srat_memory(struct srat_memory_affinity *numamem, static void * build_srat(void) { -int nb_numa_nodes = qemu_cfg_get_numa_nodes(); - -u64 *numadata = malloc_tmphigh(sizeof(u64) * (MaxCountCPUs + nb_numa_nodes)); -if (!numadata) { -warn_noalloc(); -return NULL; -} - -qemu_cfg_get_numa_data(numadata, MaxCountCPUs + nb_numa_nodes); - -qemu_cfg_get_numa_data(nb_hp_memslots, 1); +int nb_numa_nodes = nb_numanodes; struct system_resource_affinity_table *srat; int srat_size = sizeof(*srat) + sizeof(struct srat_processor_affinity) * MaxCountCPUs + @@ -782,7 +775,7 @@ build_srat(void) srat = malloc_high(srat_size); if (!srat) { warn_noalloc(); -free(numadata); +free(numa_data); return NULL; } @@ -791,6 +784,7 @@ build_srat(void) struct 
srat_processor_affinity *core = (void*)(srat + 1); int i; u64 curnode; +u64 *numadata = numa_data; for (i = 0; i MaxCountCPUs; ++i) { core-type = SRAT_PROCESSOR; @@ -847,15 +841,7 @@ build_srat(void) mem = (void*)numamem; if (nb_hp_memslots) { -u64 *hpmemdata = malloc_tmphigh(sizeof(u64) * (3 * nb_hp_memslots)); -if (!hpmemdata) { -warn_noalloc(); -free(hpmemdata); -free(numadata); -return NULL; -} - -qemu_cfg_get_numa_data(hpmemdata, 3 * nb_hp_memslots); +u64 *hpmemdata = hp_memdata; for (i = 1; i nb_hp_memslots + 1; ++i) { mem_base = *hpmemdata++; @@ -865,7 +851,7 @@ build_srat(void) numamem++; slots++; } -free(hpmemdata); +free(hp_memdata); } for (; slots nb_numa_nodes + nb_hp_memslots + 2; slots++) { @@ -875,10 +861,58 @@ build_srat(void) build_header((void*)srat, SRAT_SIGNATURE, srat_size, 1); -free(numadata); +free(numa_data); return srat; } +/* QEMU paravirt SRAT entries need to be read in before pci initilization */ +void read_srat_early(void) +{ +int i; + +nb_numanodes = qemu_cfg_get_numa_nodes(); +u64 *hpmemdata; +u64 mem_len, mem_base; + +numa_data = malloc_tmphigh(sizeof(u64) * (MaxCountCPUs + nb_numanodes)); +if (!numa_data) { +warn_noalloc(); +} + +qemu_cfg_get_numa_data(numa_data, MaxCountCPUs + nb_numanodes); +qemu_cfg_get_numa_data(nb_hp_memslots, 1); + +if (nb_hp_memslots) { +hp_memdata = malloc_tmphigh(sizeof(u64) * (3 * nb_hp_memslots)); +if (!hp_memdata) { +warn_noalloc(); +free(hp_memdata); +free(numa_data); +} + +qemu_cfg_get_numa_data(hp_memdata, 3 * nb_hp_memslots); +hpmemdata = hp_memdata; + +for (i = 1; i nb_hp_memslots + 1; ++i) { +mem_base = *hpmemdata++; +mem_len = *hpmemdata++; +hpmemdata++; +if (mem_base = 0x1LL) { +above_4g_hp_mem_size += mem_len; +} +/* if dimm fits before pci hole, append it normally */ +else if (mem_base + mem_len = BUILD_PCIMEM_START) { +below_4g_hp_mem_size += mem_len; +} +/* otherwise place it above 4GB */ +else { +above_4g_hp_mem_size += mem_len; +} +} + +} +} + static const struct pci_device_id 
acpi_find_tbl[] = { /* PIIX4 Power Management device. */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371AB_3, NULL), diff --git a/src/acpi.h b/src/acpi.h index cb21561..d29837f
[RFC PATCH v3 16/19] Update dimm state on reset
In case of hot-remove failure on a guest that does not implement _OST, the dimm bitmaps in qemu and Seabios show the dimm as unplugged, but the dimm is still present on the qdev/memory bus.
+ */ +QTAILQ_FOREACH(slot, bus->dimmlist, nextdimm) { +if (slot->pending == DIMM_REMOVE_PENDING) { +if (bus->dimm_revert) +bus->dimm_revert(bus->dimm_hotplug_qdev, slot, 0); +} +} +} + /* used to create a dimm device, only on incoming migration of a hotplugged * RAMBlock */ diff --git a/hw/dimm.h b/hw/dimm.h index a6c6e6f..ce091fe 100644 --- a/hw/dimm.h +++ b/hw/dimm.h @@ -95,5 +95,6 @@ void main_memory_bus_create(Object *parent); void dimm_config_create(char *id, uint64_t size, uint64_t node, uint32_t dimm_idx, uint32_t populated); uint64_t get_hp_memory_total(void); +void dimm_state_sync(void); #endif -- 1.7.9 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3 15/19] Add _OST dimm support
This allows qemu to receive notifications from the guest OS on success or failure of a memory hotplug request. The guest OS needs to implement the _OST functionality for this to work (linux-next: http://lkml.org/lkml/2012/6/25/321) This patch also updates dimm bitmap state and hot-remove pending flag on hot-remove fail. This allows failed hot operations to be retried at anytime. This only works for guests that use _OST notification. Also adds new _OST registers in docs/specs/acpi_hotplug.txt Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- docs/specs/acpi_hotplug.txt | 25 + hw/acpi_piix4.c | 35 ++- hw/dimm.c | 28 +++- hw/dimm.h | 10 +- 4 files changed, 95 insertions(+), 3 deletions(-) diff --git a/docs/specs/acpi_hotplug.txt b/docs/specs/acpi_hotplug.txt index cf86242..536da16 100644 --- a/docs/specs/acpi_hotplug.txt +++ b/docs/specs/acpi_hotplug.txt @@ -20,3 +20,28 @@ ejected. Written by ACPI memory device _EJ0 method to notify qemu of successfull hot-removal. Write-only. + +Memory Dimm ejection failure notification (IO port 0xafa1, 1-byte access): +--- +Dimm hot-remove _OST notification. Byte value indicates Dimm slot for which +ejection failed. + +Written by ACPI memory device _OST method to notify qemu of failed +hot-removal. Write-only. + +Memory Dimm insertion success notification (IO port 0xafa2, 1-byte access): +--- +Dimm hot-remove _OST notification. Byte value indicates Dimm slot for which +insertion succeeded. + +Written by ACPI memory device _OST method to notify qemu of failed +hot-add. Write-only. + +Memory Dimm insertion failure notification (IO port 0xafa3, 1-byte access): +--- +Dimm hot-remove _OST notification. Byte value indicates Dimm slot for which +insertion failed. + +Written by ACPI memory device _OST method to notify qemu of failed +hot-add. Write-only. 
+ diff --git a/hw/acpi_piix4.c b/hw/acpi_piix4.c index 8776669..f7220d4 100644 --- a/hw/acpi_piix4.c +++ b/hw/acpi_piix4.c @@ -49,6 +49,9 @@ #define PCI_RMV_BASE 0xae0c #define MEM_BASE 0xaf80 #define MEM_EJ_BASE 0xafa0 +#define MEM_OST_REMOVE_FAIL 0xafa1 +#define MEM_OST_ADD_SUCCESS 0xafa2 +#define MEM_OST_ADD_FAIL 0xafa3 #define PIIX4_MEM_HOTPLUG_STATUS 8 #define PIIX4_PCI_HOTPLUG_STATUS 2 @@ -87,6 +90,7 @@ typedef struct PIIX4PMState { uint8_t s4_val; } PIIX4PMState; +static int piix4_dimm_revert(DeviceState *qdev, DimmDevice *dev, int add); static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s); #define ACPI_ENABLE 0xf1 @@ -531,6 +535,15 @@ static void gpe_writeb(void *opaque, uint32_t addr, uint32_t val) case MEM_EJ_BASE: dimm_notify(val, DIMM_REMOVE_SUCCESS); break; +case MEM_OST_REMOVE_FAIL: +dimm_notify(val, DIMM_REMOVE_FAIL); +break; +case MEM_OST_ADD_SUCCESS: +dimm_notify(val, DIMM_ADD_SUCCESS); +break; +case MEM_OST_ADD_FAIL: +dimm_notify(val, DIMM_ADD_FAIL); +break; default: acpi_gpe_ioport_writeb(s-ar, addr, val); } @@ -604,13 +617,16 @@ static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s) register_ioport_read(MEM_BASE, DIMM_BITMAP_BYTES, 1, gpe_readb, s); register_ioport_write(MEM_EJ_BASE, 1, 1, gpe_writeb, s); +register_ioport_write(MEM_OST_REMOVE_FAIL, 1, 1, gpe_writeb, s); +register_ioport_write(MEM_OST_ADD_SUCCESS, 1, 1, gpe_writeb, s); +register_ioport_write(MEM_OST_ADD_FAIL, 1, 1, gpe_writeb, s); for(i = 0; i DIMM_BITMAP_BYTES; i++) { s-gperegs.mems_sts[i] = 0; } pci_bus_hotplug(bus, piix4_device_hotplug, s-dev.qdev); -dimm_bus_hotplug(piix4_dimm_hotplug, s-dev.qdev); +dimm_bus_hotplug(piix4_dimm_hotplug, piix4_dimm_revert, s-dev.qdev); } static void enable_device(PIIX4PMState *s, int slot) @@ -656,6 +672,23 @@ static int piix4_dimm_hotplug(DeviceState *qdev, DimmDevice *dev, int return 0; } +static int piix4_dimm_revert(DeviceState *qdev, DimmDevice *dev, int add) +{ +PCIDevice *pci_dev = 
DO_UPCAST(PCIDevice, qdev, qdev); +PIIX4PMState *s = DO_UPCAST(PIIX4PMState, dev, pci_dev); +struct gpe_regs *g = &s->gperegs; +DimmDevice *slot = DIMM(dev); +int idx = slot->idx; + +if (add) { +g->mems_sts[idx/8] &= ~(1 << (idx%8)); +} +else { +g->mems_sts[idx/8] |= (1 << (idx%8)); +} +return 0; +} + static int piix4_device_hotplug(DeviceState *qdev, PCIDevice *dev, PCIHotplugState state) { diff --git a/hw/dimm.c b/hw/dimm.c index 21626f6..1521462 100644 --- a/hw/dimm.c +++ b/hw/dimm.c @@
[RFC PATCH v3 14/19][SeaBIOS] Add _OST dimm method
Add support for _OST method. _OST method will write into the correct I/O byte to signal success / failure of hot-add or hot-remove to qemu. Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- src/acpi-dsdt.dsl | 50 ++ src/ssdt-mem.dsl |4 2 files changed, 54 insertions(+), 0 deletions(-) diff --git a/src/acpi-dsdt.dsl b/src/acpi-dsdt.dsl index 5d3e92b..0d37bbc 100644 --- a/src/acpi-dsdt.dsl +++ b/src/acpi-dsdt.dsl @@ -762,6 +762,28 @@ DefinitionBlock ( MPE, 8 } + +/* Memory hot-remove notify failure byte */ +OperationRegion(MEEF, SystemIO, 0xafa1, 1) +Field (MEEF, ByteAcc, NoLock, Preserve) +{ +MEF, 8 +} + +/* Memory hot-add notify success byte */ +OperationRegion(MPIS, SystemIO, 0xafa2, 1) +Field (MPIS, ByteAcc, NoLock, Preserve) +{ +MIS, 8 +} + +/* Memory hot-add notify failure byte */ +OperationRegion(MPIF, SystemIO, 0xafa3, 1) +Field (MPIF, ByteAcc, NoLock, Preserve) +{ +MIF, 8 +} + Method(MESC, 0) { // Local5 = active memdevice bitmap Store (MES, Local5) @@ -802,6 +824,34 @@ DefinitionBlock ( Store(Arg0, MPE) Sleep(200) } +Method (MOST, 3, Serialized) { +// _OST method - OS status indication +Switch (And(Arg0, 0xFF)) { +Case(0x3) +{ +Switch(And(Arg1, 0xFF)) { +Case(0x1) { +Store(Arg2, MEF) +// Revert MEON flag for this memory device to one +Store(One, Index(MEON, Arg2)) +} +} +} +Case(0x1) +{ +Switch(And(Arg1, 0xFF)) { +Case(0x0) { +Store(Arg2, MIS) +} +Case(0x1) { +Store(Arg2, MIF) +// Revert MEON flag for this memory device to zero +Store(Zero, Index(MEON, Arg2)) +} +} +} +} +} } diff --git a/src/ssdt-mem.dsl b/src/ssdt-mem.dsl index ee322f0..041d301 100644 --- a/src/ssdt-mem.dsl +++ b/src/ssdt-mem.dsl @@ -38,6 +38,7 @@ DefinitionBlock (ssdt-mem.aml, SSDT, 0x02, BXPC, CSSDT, 0x1) External(CMST, MethodObj) External(MPEJ, MethodObj) +External(MOST, MethodObj) Name(_CRS, ResourceTemplate() { QwordMemory( @@ -60,6 +61,9 @@ DefinitionBlock (ssdt-mem.aml, SSDT, 0x02, BXPC, CSSDT, 0x1) Method (_EJ0, 1, NotSerialized) { MPEJ(ID, Arg0) } 
+Method (_OST, 3) { +MOST(Arg0, Arg1, ID) +} } } -- 1.7.9 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3 13/19] balloon: update with hotplugged memory
query-balloon and info balloon should report total memory available to the guest. balloon inflate/ deflate can also use all memory available to the guest (initial + hotplugged memory) Ballon driver has been minimaly tested with the patch, please review and test. Caveat: if the guest does not online hotplugged-memory, it's easy for a balloon inflate command to OOM a guest. Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- hw/virtio-balloon.c | 13 + 1 files changed, 9 insertions(+), 4 deletions(-) diff --git a/hw/virtio-balloon.c b/hw/virtio-balloon.c index dd1a650..bca21bc 100644 --- a/hw/virtio-balloon.c +++ b/hw/virtio-balloon.c @@ -22,6 +22,7 @@ #include virtio-balloon.h #include kvm.h #include exec-memory.h +#include dimm.h #if defined(__linux__) #include sys/mman.h @@ -147,10 +148,11 @@ static void virtio_balloon_set_config(VirtIODevice *vdev, VirtIOBalloon *dev = to_virtio_balloon(vdev); struct virtio_balloon_config config; uint32_t oldactual = dev-actual; +uint64_t hotplugged_ram_size = get_hp_memory_total(); memcpy(config, config_data, 8); dev-actual = le32_to_cpu(config.actual); if (dev-actual != oldactual) { -qemu_balloon_changed(ram_size - +qemu_balloon_changed(ram_size + hotplugged_ram_size - (dev-actual VIRTIO_BALLOON_PFN_SHIFT)); } } @@ -188,17 +190,20 @@ static void virtio_balloon_stat(void *opaque, BalloonInfo *info) info-actual = ram_size - ((uint64_t) dev-actual VIRTIO_BALLOON_PFN_SHIFT); +info-actual += get_hp_memory_total(); } static void virtio_balloon_to_target(void *opaque, ram_addr_t target) { VirtIOBalloon *dev = opaque; +uint64_t hotplugged_ram_size = get_hp_memory_total(); -if (target ram_size) { -target = ram_size; +if (target ram_size + hotplugged_ram_size) { +target = ram_size + hotplugged_ram_size; } if (target) { -dev-num_pages = (ram_size - target) VIRTIO_BALLOON_PFN_SHIFT; +dev-num_pages = (ram_size + hotplugged_ram_size - target) + VIRTIO_BALLOON_PFN_SHIFT; virtio_notify_config(dev-vdev); } } -- 1.7.9 -- 
To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3 12/19] Implement info memory-total and query-memory-total
Returns total physical memory available to guest in bytes, including hotplugged memory. Note that the number reported here may be different from what the guest sees e.g. if the guest has not logically onlined hotplugged memory. This functionality is provided independently of a balloon device, since a guest can be using ACPI memory hotplug without using a balloon device. Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- hmp-commands.hx |2 ++ hmp.c|7 +++ hmp.h|1 + hw/dimm.c| 21 + hw/dimm.h|1 + monitor.c|7 +++ qapi-schema.json | 11 +++ qmp-commands.hx | 20 8 files changed, 70 insertions(+), 0 deletions(-) diff --git a/hmp-commands.hx b/hmp-commands.hx index cfb1b67..988d207 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -1464,6 +1464,8 @@ show qdev device model list show roms @item info memory-hotplug show memory-hotplug +@item info memory-total +show memory-total @end table ETEXI diff --git a/hmp.c b/hmp.c index 4b3d63d..cc31ddc 100644 --- a/hmp.c +++ b/hmp.c @@ -1185,3 +1185,10 @@ void hmp_info_memory_hotplug(Monitor *mon) qapi_free_MemHpInfoList(info); } + +void hmp_info_memory_total(Monitor *mon) +{ +uint64_t ram_total; +ram_total = (uint64_t)qmp_query_memory_total(NULL); +monitor_printf(mon, MemTotal: %lu \n, ram_total); +} diff --git a/hmp.h b/hmp.h index 986705a..ab96dba 100644 --- a/hmp.h +++ b/hmp.h @@ -74,5 +74,6 @@ void hmp_closefd(Monitor *mon, const QDict *qdict); void hmp_send_key(Monitor *mon, const QDict *qdict); void hmp_screen_dump(Monitor *mon, const QDict *qdict); void hmp_info_memory_hotplug(Monitor *mon); +void hmp_info_memory_total(Monitor *mon); #endif diff --git a/hw/dimm.c b/hw/dimm.c index fbd93a8..21626f6 100644 --- a/hw/dimm.c +++ b/hw/dimm.c @@ -28,6 +28,7 @@ static DimmBus *main_memory_bus; /* the following list is used to hold dimm config info before machine * initialization. 
After machine init, the list is emptied and not used anymore.*/ static DimmConfiglist dimmconfig_list = QTAILQ_HEAD_INITIALIZER(dimmconfig_list); +extern ram_addr_t ram_size; static void dimmbus_dev_print(Monitor *mon, DeviceState *dev, int indent); static char *dimmbus_get_fw_dev_path(DeviceState *dev); @@ -233,6 +234,26 @@ void setup_fwcfg_hp_dimms(uint64_t *fw_cfg_slots) } } +uint64_t get_hp_memory_total(void) +{ +DimmBus *bus = main_memory_bus; +DimmDevice *slot; +uint64_t info = 0; + +QTAILQ_FOREACH(slot, bus-dimmlist, nextdimm) { +info += slot-size; +} +return info; +} + +int64_t qmp_query_memory_total(Error **errp) +{ +uint64_t info; +info = ram_size + get_hp_memory_total(); + +return (int64_t)info; +} + void dimm_notify(uint32_t idx, uint32_t event) { DimmBus *bus = main_memory_bus; diff --git a/hw/dimm.h b/hw/dimm.h index 95251ba..21225be 100644 --- a/hw/dimm.h +++ b/hw/dimm.h @@ -86,5 +86,6 @@ int dimm_add(char *id); void main_memory_bus_create(Object *parent); void dimm_config_create(char *id, uint64_t size, uint64_t node, uint32_t dimm_idx, uint32_t populated); +uint64_t get_hp_memory_total(void); #endif diff --git a/monitor.c b/monitor.c index be9a1d9..4f5ea60 100644 --- a/monitor.c +++ b/monitor.c @@ -2747,6 +2747,13 @@ static mon_cmd_t info_cmds[] = { .mhandler.info = hmp_info_memory_hotplug, }, { +.name = memory-total, +.args_type = , +.params = , +.help = show total memory size, +.mhandler.info = hmp_info_memory_total, +}, +{ .name = NULL, }, }; diff --git a/qapi-schema.json b/qapi-schema.json index 3706a2a..c1d2571 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -2581,3 +2581,14 @@ # Since: 1.3 ## { 'command': 'query-memory-hotplug', 'returns': ['MemHpInfo'] } + +## +# @query-memory-total: +# +# Returns total memory in bytes, including hotplugged dimms +# +# Returns: int +# +# Since: 1.3 +## +{ 'command': 'query-memory-total', 'returns': 'int' } diff --git a/qmp-commands.hx b/qmp-commands.hx index e50dcc2..20b7eea 100644 --- 
a/qmp-commands.hx +++ b/qmp-commands.hx @@ -2576,3 +2576,23 @@ Example: } EQMP + +{ +.name = query-memory-total, +.args_type = , +.mhandler.cmd_new = qmp_marshal_input_query_memory_total +}, +SQMP +query-memory-total +-- + +Return total memory in bytes, including hotplugged dimms + +Example: + +- { execute: query-memory-total } +- { + return: 1073741824 + } + +EQMP -- 1.7.9 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3 07/19] acpi_piix4: Implement memory device hotplug registers
A 32-byte register is used to present up to 256 hotplug-able memory devices to BIOS and OSPM. Hot-add and hot-remove functions trigger an ACPI hotplug event through these. Only reads are allowed from these registers. An ACPI hot-remove event but needs to wait for OSPM to eject the device. We use a single-byte register to know when OSPM has called the _EJ function for a particular dimm. A write to this byte will depopulate the respective dimm. Only writes are allowed to this byte. v1-v2: mems_sts address moved from 0xaf20 to 0xaf80 (to accomodate more space for cpu-hotplugging in the future). _EJ array is reduced to a single byte. Add documentation in docs/specs/acpi_hotplug.txt v2-v3: minor name changes Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- docs/specs/acpi_hotplug.txt | 22 + hw/acpi_piix4.c | 73 -- 2 files changed, 91 insertions(+), 4 deletions(-) create mode 100644 docs/specs/acpi_hotplug.txt diff --git a/docs/specs/acpi_hotplug.txt b/docs/specs/acpi_hotplug.txt new file mode 100644 index 000..cf86242 --- /dev/null +++ b/docs/specs/acpi_hotplug.txt @@ -0,0 +1,22 @@ +QEMU-ACPI BIOS hotplug interface +-- +This document describes the interface between QEMU and the ACPI BIOS for non-PCI +space. For the PCI interface please look at docs/specs/acpi_pci_hotplug.txt + +QEMU-ACPI BIOS memory hotplug interface +-- + +Memory Dimm status array (IO port 0xaf80-0xaf9f, 1-byte access): +--- +Dimm hot-plug notification pending. One bit per slot. + +Read by ACPI BIOS GPE.3 handler to notify OS of memory hot-add or hot-remove +events. Read-only. + +Memory Dimm ejection success notification (IO port 0xafa0, 1-byte access): +--- +Dimm hot-remove _EJ0 notification. Byte value indicates Dimm slot that was +ejected. + +Written by ACPI memory device _EJ0 method to notify qemu of successfull +hot-removal. Write-only. 
diff --git a/hw/acpi_piix4.c b/hw/acpi_piix4.c index c56220b..8776669 100644 --- a/hw/acpi_piix4.c +++ b/hw/acpi_piix4.c @@ -28,6 +28,8 @@ #include range.h #include ioport.h #include fw_cfg.h +#include sysbus.h +#include dimm.h //#define DEBUG @@ -45,9 +47,15 @@ #define PCI_DOWN_BASE 0xae04 #define PCI_EJ_BASE 0xae08 #define PCI_RMV_BASE 0xae0c +#define MEM_BASE 0xaf80 +#define MEM_EJ_BASE 0xafa0 +#define PIIX4_MEM_HOTPLUG_STATUS 8 #define PIIX4_PCI_HOTPLUG_STATUS 2 +struct gpe_regs { +uint8_t mems_sts[DIMM_BITMAP_BYTES]; +}; struct pci_status { uint32_t up; /* deprecated, maintained for migration compatibility */ uint32_t down; @@ -69,6 +77,7 @@ typedef struct PIIX4PMState { Notifier machine_ready; /* for pci hotplug */ +struct gpe_regs gperegs; struct pci_status pci0_status; uint32_t pci0_hotplug_enable; uint32_t pci0_slot_device_present; @@ -93,8 +102,8 @@ static void pm_update_sci(PIIX4PMState *s) ACPI_BITMASK_POWER_BUTTON_ENABLE | ACPI_BITMASK_GLOBAL_LOCK_ENABLE | ACPI_BITMASK_TIMER_ENABLE)) != 0) || -(((s-ar.gpe.sts[0] s-ar.gpe.en[0]) - PIIX4_PCI_HOTPLUG_STATUS) != 0); +(((s-ar.gpe.sts[0] s-ar.gpe.en[0]) + (PIIX4_PCI_HOTPLUG_STATUS | PIIX4_MEM_HOTPLUG_STATUS)) != 0); qemu_set_irq(s-irq, sci_level); /* schedule a timer interruption if needed */ @@ -499,7 +508,16 @@ type_init(piix4_pm_register_types) static uint32_t gpe_readb(void *opaque, uint32_t addr) { PIIX4PMState *s = opaque; -uint32_t val = acpi_gpe_ioport_readb(s-ar, addr); +uint32_t val = 0; +struct gpe_regs *g = s-gperegs; + +switch (addr) { +case MEM_BASE ... 
MEM_BASE+DIMM_BITMAP_BYTES: +val = g->mems_sts[addr - MEM_BASE]; +break; +default: +val = acpi_gpe_ioport_readb(&s->ar, addr); +} PIIX4_DPRINTF("gpe read %x == %x\n", addr, val); return val; @@ -509,7 +527,13 @@ static void gpe_writeb(void *opaque, uint32_t addr, uint32_t val) { PIIX4PMState *s = opaque; -acpi_gpe_ioport_writeb(&s->ar, addr, val); +switch (addr) { +case MEM_EJ_BASE: +dimm_notify(val, DIMM_REMOVE_SUCCESS); +break; +default: +acpi_gpe_ioport_writeb(&s->ar, addr, val); +} pm_update_sci(s); PIIX4_DPRINTF("gpe write %x == %d\n", addr, val); @@ -560,9 +584,11 @@ static uint32_t pcirmv_read(void *opaque, uint32_t addr) static int piix4_device_hotplug(DeviceState *qdev, PCIDevice *dev, PCIHotplugState state); +static int piix4_dimm_hotplug(DeviceState *qdev, DimmDevice *dev, int add); static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s) { +int i = 0;
[RFC PATCH v3 05/19] Implement dimm device abstraction
Each hotplug-able memory slot is a DimmDevice. All DimmDevices are attached to a new bus called DimmBus. This bus is introduced so that we no longer depend on hotplug-capability of main system bus (the main bus does not allow hotplugging). The DimmBus should be attached to a chipset Device (i440fx in case of the pc) A hot-add operation for a particular dimm: - creates a new DimmDevice and attaches it to the DimmBus - creates a new MemoryRegion of the given physical address offset, size and node proximity, and attaches it to main system memory as a sub_region. A successful hot-remove operation detaches and frees the MemoryRegion from system memory, and removes the DimmDevice from the DimmBus. Hotplug operations are done through normal device_add /device_del commands. Also add properties to DimmDevice. Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- hw/dimm.c | 305 + hw/dimm.h | 90 ++ 2 files changed, 395 insertions(+), 0 deletions(-) create mode 100644 hw/dimm.c create mode 100644 hw/dimm.h diff --git a/hw/dimm.c b/hw/dimm.c new file mode 100644 index 000..288b997 --- /dev/null +++ b/hw/dimm.c @@ -0,0 +1,305 @@ +/* + * Dimm device for Memory Hotplug + * + * Copyright ProfitBricks GmbH 2012 + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see http://www.gnu.org/licenses/ + */ + +#include trace.h +#include qdev.h +#include dimm.h +#include time.h +#include ../exec-memory.h +#include qmp-commands.h + +/* the system-wide memory bus. */ +static DimmBus *main_memory_bus; +/* the following list is used to hold dimm config info before machine + * initialization. After machine init, the list is emptied and not used anymore.*/ +static DimmConfiglist dimmconfig_list = QTAILQ_HEAD_INITIALIZER(dimmconfig_list); + +static void dimmbus_dev_print(Monitor *mon, DeviceState *dev, int indent); +static char *dimmbus_get_fw_dev_path(DeviceState *dev); + +static Property dimm_properties[] = { +DEFINE_PROP_UINT64(start, DimmDevice, start, 0), +DEFINE_PROP_UINT64(size, DimmDevice, size, DEFAULT_DIMMSIZE), +DEFINE_PROP_UINT32(node, DimmDevice, node, 0), +DEFINE_PROP_END_OF_LIST(), +}; + +static void dimmbus_dev_print(Monitor *mon, DeviceState *dev, int indent) +{ +} + +static char *dimmbus_get_fw_dev_path(DeviceState *dev) +{ +char path[40]; + +snprintf(path, sizeof(path), %s, qdev_fw_name(dev)); +return strdup(path); +} + +static void dimm_bus_class_init(ObjectClass *klass, void *data) +{ +BusClass *k = BUS_CLASS(klass); + +k-print_dev = dimmbus_dev_print; +k-get_fw_dev_path = dimmbus_get_fw_dev_path; +} + +static void dimm_bus_initfn(Object *obj) +{ +DimmConfig *dimm_cfg, *next_dimm_cfg; +DimmBus *bus = DIMM_BUS(obj); +QTAILQ_INIT(bus-dimmconfig_list); +QTAILQ_INIT(bus-dimmlist); + +QTAILQ_FOREACH_SAFE(dimm_cfg, dimmconfig_list, nextdimmcfg, next_dimm_cfg) { +QTAILQ_REMOVE(dimmconfig_list, dimm_cfg, nextdimmcfg); +QTAILQ_INSERT_TAIL(bus-dimmconfig_list, dimm_cfg, nextdimmcfg); +} +} + +static const TypeInfo dimm_bus_info = { +.name = TYPE_DIMM_BUS, +.parent = TYPE_BUS, +.instance_size = sizeof(DimmBus), +.instance_init = dimm_bus_initfn, +.class_init = dimm_bus_class_init, +}; + +void 
main_memory_bus_create(Object *parent) +{ +main_memory_bus = g_malloc0(dimm_bus_info.instance_size); +main_memory_bus-qbus.glib_allocated = true; +qbus_create_inplace(main_memory_bus-qbus, TYPE_DIMM_BUS, DEVICE(parent), +membus); +} + +static void dimm_populate(DimmDevice *s) +{ +DeviceState *dev= (DeviceState*)s; +MemoryRegion *new = NULL; + +new = g_malloc(sizeof(MemoryRegion)); +memory_region_init_ram(new, dev-id, s-size); +vmstate_register_ram_global(new); +memory_region_add_subregion(get_system_memory(), s-start, new); +s-mr = new; +} + +static void dimm_depopulate(DimmDevice *s) +{ +assert(s); +vmstate_unregister_ram(s-mr, NULL); +memory_region_del_subregion(get_system_memory(), s-mr); +memory_region_destroy(s-mr); +s-mr = NULL; +} + +void dimm_config_create(char *id, uint64_t size, uint64_t node, uint32_t +dimm_idx, uint32_t populated) +{ +DimmConfig *dimm_cfg; +dimm_cfg = (DimmConfig*) g_malloc0(sizeof(DimmConfig)); +dimm_cfg-name = id;
[RFC PATCH v3 06/19] Implement -dimm command line option
Example: -dimm id=dimm0,size=512M,node=0,populated=off will define a 512M memory slot belonging to numa node 0. When populated=on, a DimmDevice is created and hot-plugged at system startup. Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- hw/Makefile.objs |2 +- qemu-config.c| 25 + qemu-options.hx |5 + sysemu.h |1 + vl.c | 50 ++ 5 files changed, 82 insertions(+), 1 deletions(-) diff --git a/hw/Makefile.objs b/hw/Makefile.objs index 6dfebd2..8c5c39a 100644 --- a/hw/Makefile.objs +++ b/hw/Makefile.objs @@ -26,7 +26,7 @@ hw-obj-$(CONFIG_I8254) += i8254_common.o i8254.o hw-obj-$(CONFIG_PCSPK) += pcspk.o hw-obj-$(CONFIG_PCKBD) += pckbd.o hw-obj-$(CONFIG_FDC) += fdc.o -hw-obj-$(CONFIG_ACPI) += acpi.o acpi_piix4.o +hw-obj-$(CONFIG_ACPI) += acpi.o acpi_piix4.o dimm.o hw-obj-$(CONFIG_APM) += pm_smbus.o apm.o hw-obj-$(CONFIG_DMA) += dma.o hw-obj-$(CONFIG_I82374) += i82374.o diff --git a/qemu-config.c b/qemu-config.c index eba977e..4022d64 100644 --- a/qemu-config.c +++ b/qemu-config.c @@ -646,6 +646,30 @@ QemuOptsList qemu_boot_opts = { }, }; +static QemuOptsList qemu_dimm_opts = { +.name = dimm, +.head = QTAILQ_HEAD_INITIALIZER(qemu_dimm_opts.head), +.desc = { +{ +.name = id, +.type = QEMU_OPT_STRING, +.help = id of this dimm device, +},{ +.name = size, +.type = QEMU_OPT_SIZE, +.help = memory size for this dimm, +},{ +.name = populated, +.type = QEMU_OPT_BOOL, +.help = populated for this dimm, +},{ +.name = node, +.type = QEMU_OPT_NUMBER, +.help = NUMA node number (i.e. proximity) for this dimm, +}, +{ /* end of list */ } +}, +}; static QemuOptsList *vm_config_groups[32] = { qemu_drive_opts, qemu_chardev_opts, @@ -662,6 +686,7 @@ static QemuOptsList *vm_config_groups[32] = { qemu_boot_opts, qemu_iscsi_opts, qemu_sandbox_opts, +qemu_dimm_opts, NULL, }; diff --git a/qemu-options.hx b/qemu-options.hx index 804a2d1..3687722 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -2842,3 +2842,8 @@ HXCOMM This is the last statement. 
Insert new options before this line! STEXI @end table ETEXI + +DEF(dimm, HAS_ARG, QEMU_OPTION_dimm, +-dimm id=dimmid,size=sz,node=nd,populated=on|off\n +specify memory dimm device with name dimmid, size sz on node nd, +QEMU_ARCH_ALL) diff --git a/sysemu.h b/sysemu.h index 65552ac..7baf9c9 100644 --- a/sysemu.h +++ b/sysemu.h @@ -139,6 +139,7 @@ extern QEMUClock *rtc_clock; extern int nb_numa_nodes; extern uint64_t node_mem[MAX_NODES]; extern unsigned long *node_cpumask[MAX_NODES]; +extern int nb_hp_dimms; #define MAX_OPTION_ROMS 16 typedef struct QEMUOptionRom { diff --git a/vl.c b/vl.c index 7c577fa..af1745c 100644 --- a/vl.c +++ b/vl.c @@ -126,6 +126,7 @@ int main(int argc, char **argv) #include hw/xen.h #include hw/qdev.h #include hw/loader.h +#include hw/dimm.h #include bt-host.h #include net.h #include net/slirp.h @@ -248,6 +249,7 @@ QTAILQ_HEAD(, FWBootEntry) fw_boot_order = QTAILQ_HEAD_INITIALIZER(fw_boot_order int nb_numa_nodes; uint64_t node_mem[MAX_NODES]; unsigned long *node_cpumask[MAX_NODES]; +int nb_hp_dimms; uint8_t qemu_uuid[16]; @@ -530,6 +532,37 @@ static void configure_rtc_date_offset(const char *startdate, int legacy) } } +static void configure_dimm(QemuOpts *opts) +{ +const char *id; +uint64_t size, node; +bool populated; +QemuOpts *devopts; +char buf[256]; +if (nb_hp_dimms == MAX_DIMMS) { +fprintf(stderr, qemu: maximum number of DIMMs (%d) exceeded\n, +MAX_DIMMS); +exit(1); +} +id = qemu_opts_id(opts); +size = qemu_opt_get_size(opts, size, DEFAULT_DIMMSIZE); +populated = qemu_opt_get_bool(opts, populated, 0); +node = qemu_opt_get_number(opts, node, 0); + +dimm_config_create((char*)id, size, node, nb_hp_dimms, 0); + +if (populated) { +devopts = qemu_opts_create(qemu_find_opts(device), id, 0, NULL); +qemu_opt_set(devopts, driver, dimm); +snprintf(buf, sizeof(buf), %lu, size); +qemu_opt_set(devopts, size, buf); +snprintf(buf, sizeof(buf), %lu, node); +qemu_opt_set(devopts, node, buf); +qemu_opt_set(devopts, bus, membus); +} +nb_hp_dimms++; +} + 
static void configure_rtc(QemuOpts *opts) { const char *value; @@ -2354,6 +2387,8 @@ int main(int argc, char **argv, char **envp) DisplayChangeListener *dcl; int cyls, heads, secs, translation; QemuOpts *hda_opts = NULL, *opts, *machine_opts; +QemuOpts *dimm_opts[MAX_DIMMS]; +int nb_dimm_opts = 0; QemuOptsList *olist; int optind; const char *optarg; @@ -3288,6 +3323,18 @@ int
[RFC PATCH v3 02/19][SeaBIOS] Add SSDT memory device support
Define SSDT hotplug-able memory devices in _SB namespace. The dynamically generated SSDT includes per memory device hotplug methods. These methods just call methods defined in the DSDT. Also dynamically generate a MTFY method and a MEON array of the online/available memory devices. ACPI extraction macros are used to place the AML code in variables later used by src/acpi. The design is taken from SSDT cpu generation. Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- Makefile |2 +- src/ssdt-mem.dsl | 65 ++ 2 files changed, 66 insertions(+), 1 deletions(-) create mode 100644 src/ssdt-mem.dsl diff --git a/Makefile b/Makefile index 5486f88..e82cfc9 100644 --- a/Makefile +++ b/Makefile @@ -233,7 +233,7 @@ $(OUT)%.hex: src/%.dsl ./tools/acpi_extract_preprocess.py ./tools/acpi_extract.p $(Q)$(PYTHON) ./tools/acpi_extract.py $(OUT)$*.lst $(OUT)$*.off $(Q)cat $(OUT)$*.off $@ -$(OUT)ccode32flat.o: $(OUT)acpi-dsdt.hex $(OUT)ssdt-proc.hex $(OUT)ssdt-pcihp.hex $(OUT)ssdt-susp.hex +$(OUT)ccode32flat.o: $(OUT)acpi-dsdt.hex $(OUT)ssdt-proc.hex $(OUT)ssdt-pcihp.hex $(OUT)ssdt-susp.hex $(OUT)ssdt-mem.hex Kconfig rules diff --git a/src/ssdt-mem.dsl b/src/ssdt-mem.dsl new file mode 100644 index 000..ee322f0 --- /dev/null +++ b/src/ssdt-mem.dsl @@ -0,0 +1,65 @@ +/* This file is the basis for the ssdt_mem[] variable in src/acpi.c. + * It is similar in design to the ssdt_proc variable. + * It defines the contents of the per-cpu Processor() object. At + * runtime, a dynamically generated SSDT will contain one copy of this + * AML snippet for every possible memory device in the system. The + * objects will * be placed in the \_SB_ namespace. + * + * In addition to the aml code generated from this file, the + * src/acpi.c file creates a MEMNTFY method with an entry for each memdevice: + * Method(MTFY, 2) { + * If (LEqual(Arg0, 0x00)) { Notify(MP00, Arg1) } + * If (LEqual(Arg0, 0x01)) { Notify(MP01, Arg1) } + * ... 
+ * } + * and a MEON array with the list of active and inactive memory devices: + * Name(MEON, Package() { One, One, ..., Zero, Zero, ... }) + */ +ACPI_EXTRACT_ALL_CODE ssdm_mem_aml + +DefinitionBlock (ssdt-mem.aml, SSDT, 0x02, BXPC, CSSDT, 0x1) +/* v-- DO NOT EDIT --v */ +{ +ACPI_EXTRACT_DEVICE_START ssdt_mem_start +ACPI_EXTRACT_DEVICE_END ssdt_mem_end +ACPI_EXTRACT_DEVICE_STRING ssdt_mem_name +Device(MPAA) { +ACPI_EXTRACT_NAME_BYTE_CONST ssdt_mem_id +Name(ID, 0xAA) +/* ^-- DO NOT EDIT --^ + * + * The src/acpi.c code requires the above layout so that it can update + * MPAA and 0xAA with the appropriate MEMDEVICE id (see + * SD_OFFSET_MEMHEX/MEMID1/MEMID2). Don't change the above without + * also updating the C code. + */ +Name(_HID, EISAID(PNP0C80)) +Name(_PXM, 0xAA) + +External(CMST, MethodObj) +External(MPEJ, MethodObj) + +Name(_CRS, ResourceTemplate() { +QwordMemory( + ResourceConsumer, + , + MinFixed, + MaxFixed, + Cacheable, + ReadWrite, + 0x0, + 0xDEADBEEF, + 0xE6ADBEEE, + 0x, + 0x0800, + ) +}) +Method (_STA, 0) { +Return(CMST(ID)) +} +Method (_EJ0, 1, NotSerialized) { +MPEJ(ID, Arg0) +} +} +} + -- 1.7.9 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3 19/19] alternative: Introduce paravirt interface QEMU_CFG_PCI_WINDOW
Qemu already calculates the 32-bit and 64-bit PCI starting offsets based on initial memory and hotplug-able dimms. This info needs to be passed to Seabios for PCI initialization. Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- docs/specs/fwcfg.txt |9 + hw/fw_cfg.h |1 + hw/pc_piix.c | 10 ++ 3 files changed, 20 insertions(+), 0 deletions(-) diff --git a/docs/specs/fwcfg.txt b/docs/specs/fwcfg.txt index 55f96d9..d9fa215 100644 --- a/docs/specs/fwcfg.txt +++ b/docs/specs/fwcfg.txt @@ -26,3 +26,12 @@ Entry max_cpus+nb_numa_nodes+1 contains the number of memory dimms (nb_hp_dimms) The last 3 * nb_hp_dimms entries are organized in triplets: Each triplet contains the physical address offset, size (in bytes), and node proximity for the respective dimm. + +FW_CFG_PCI_WINDOW paravirt info + +QEMU passes the starting address for the 32-bit and 64-bit PCI windows to BIOS. +The following layouts are followed: + + +pcimem32_start | pcimem64_start | + diff --git a/hw/fw_cfg.h b/hw/fw_cfg.h index 856bf91..6c8c151 100644 --- a/hw/fw_cfg.h +++ b/hw/fw_cfg.h @@ -27,6 +27,7 @@ #define FW_CFG_SETUP_SIZE 0x17 #define FW_CFG_SETUP_DATA 0x18 #define FW_CFG_FILE_DIR 0x19 +#define FW_CFG_PCI_WINDOW 0x1a #define FW_CFG_FILE_FIRST 0x20 #define FW_CFG_FILE_SLOTS 0x10 diff --git a/hw/pc_piix.c b/hw/pc_piix.c index d1fd276..034761f 100644 --- a/hw/pc_piix.c +++ b/hw/pc_piix.c @@ -44,6 +44,7 @@ #include memory.h #include exec-memory.h #include dimm.h +#include fw_cfg.h #ifdef CONFIG_XEN # include xen/hvm/hvm_info_table.h #endif @@ -149,6 +150,7 @@ static void pc_init1(MemoryRegion *system_memory, MemoryRegion *pci_memory; MemoryRegion *rom_memory; void *fw_cfg = NULL; +uint64_t *pci_window_fw_cfg; pc_cpus_init(cpu_model); @@ -205,6 +207,14 @@ static void pc_init1(MemoryRegion *system_memory, ? 
0 : ((uint64_t)1 62)), pci_memory, ram_memory); + +pci_window_fw_cfg = g_malloc0(2 * 8); +pci_window_fw_cfg[0] = cpu_to_le64(below_4g_mem_size + +below_4g_hp_mem_size); +pci_window_fw_cfg[1] = cpu_to_le64(0x1ULL + above_4g_mem_size ++ above_4g_hp_mem_size); +fw_cfg_add_bytes(fw_cfg, FW_CFG_PCI_WINDOW, +(uint8_t *)pci_window_fw_cfg, 2 * 8); } else { pci_bus = NULL; i440fx_state = NULL; -- 1.7.9 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3 20/19][SeaBIOS] alternative: Use paravirt interface for pci windows
Initialize the 32-bit and 64-bit pci starting offsets from values passed in by the qemu paravirt interface QEMU_CFG_PCI_WINDOW. Qemu calculates the starting offsets based on initial memory and hotplug-able dimms. Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com --- src/paravirt.c |6 ++ src/paravirt.h |2 ++ src/pciinit.c |5 ++--- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/paravirt.c b/src/paravirt.c index 2a98d53..390ef30 100644 --- a/src/paravirt.c +++ b/src/paravirt.c @@ -346,3 +346,9 @@ void qemu_cfg_romfile_setup(void) dprintf(3, Found fw_cfg file: %s (size=%d)\n, file-name, file-size); } } + +void qemu_cfg_get_pci_offsets(u64 *pcimem_start, u64 *pcimem64_start) +{ +qemu_cfg_read_entry(pcimem_start, QEMU_CFG_PCI_WINDOW, sizeof(u64)); +qemu_cfg_read((u8*)(pcimem64_start), sizeof(u64)); +} diff --git a/src/paravirt.h b/src/paravirt.h index a284c41..b53ff88 100644 --- a/src/paravirt.h +++ b/src/paravirt.h @@ -35,6 +35,7 @@ static inline int kvm_para_available(void) #define QEMU_CFG_BOOT_MENU 0x0e #define QEMU_CFG_MAX_CPUS 0x0f #define QEMU_CFG_FILE_DIR 0x19 +#define QEMU_CFG_PCI_WINDOW 0x1a #define QEMU_CFG_ARCH_LOCAL 0x8000 #define QEMU_CFG_ACPI_TABLES(QEMU_CFG_ARCH_LOCAL + 0) #define QEMU_CFG_SMBIOS_ENTRIES (QEMU_CFG_ARCH_LOCAL + 1) @@ -65,5 +66,6 @@ struct e820_reservation { u32 qemu_cfg_e820_entries(void); void* qemu_cfg_e820_load_next(void *addr); void qemu_cfg_romfile_setup(void); +void qemu_cfg_get_pci_offsets(u64 *pcimem_start, u64 *pcimem64_start); #endif diff --git a/src/pciinit.c b/src/pciinit.c index 68f302a..64468a0 100644 --- a/src/pciinit.c +++ b/src/pciinit.c @@ -592,8 +592,7 @@ static void pci_region_map_entries(struct pci_bus *busses, struct pci_region *r) static void pci_bios_map_devices(struct pci_bus *busses) { -pcimem_start = RamSize; - +qemu_cfg_get_pci_offsets(pcimem_start, pcimem64_start); if (pci_bios_init_root_regions(busses)) { struct pci_region r64_mem, r64_pref; r64_mem.list = NULL; @@ 
-611,7 +610,7 @@ static void pci_bios_map_devices(struct pci_bus *busses) u64 align_mem = pci_region_align(r64_mem); u64 align_pref = pci_region_align(r64_pref); -r64_mem.base = ALIGN(0x1LL + RamSizeOver4G, align_mem); +r64_mem.base = ALIGN(pcimem64_start, align_mem); r64_pref.base = ALIGN(r64_mem.base + sum_mem, align_pref); pcimem64_start = r64_mem.base; pcimem64_end = r64_pref.base + sum_pref; -- 1.7.9 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH RFC 0/2] kvm: Improving undercommit, overcommit scenarios in PLE handler
In some special scenarios like #vcpu = #pcpu, PLE handler may prove very costly, because there is no need to iterate over vcpus and do unsuccessful yield_to burning CPU. An idea to solve this is: 1) As Avi had proposed we can modify hardware ple_window dynamically to avoid frequent PL-exit. (IMHO, it is difficult to decide when we have mixed type of VMs). Another idea, proposed in the first patch, is to identify non-overcommit case and just return from the PLE handler. There are are many ways to identify non-overcommit scenario. 1) Using loadavg etc (get_avenrun/calc_global_load /this_cpu_load) 2) Explicitly check nr_running()/num_online_cpus() 3) Check source vcpu runqueue length. Not sure how can we make use of (1) effectively/how to use it. (2) has significant overhead since it iterates all cpus. so this patch uses third method. (I feel it is uglier to export runqueue length, but expecting suggestion on this). In second patch, when we have large number of small guests, it is possible that a spinning vcpu fails to yield_to any vcpu of same VM and go back and spin. This is also not effective when we are over-committed. Instead, we do a schedule() so that we give chance to other VMs to run. Raghavendra K T(2): Handle undercommitted guest case in PLE handler Be courteous to other VMs in overcommitted scenario in PLE handler Results: base = 3.6.0-rc5 + ple handler optimization patches from kvm tree. patched = base + patch1 + patch2 machine: x240 with 16 core with HT enabled (32 cpu thread). 32 vcpu guest with 8GB RAM. 
+---+---+---++---+ ebizzy (record/sec higher is better) +---+---+---++---+ basestddev patchedstdev%improve +---+---+---++---+ 11293.3750 624.4378 18209.6250 371.706161.24166 3641.8750 468.94003725.5000 253.7823 2.29621 +---+---+---++---+ +---+---+---++---+ kernbench (time in sec lower is better) +---+---+---++---+ basestddev patchedstdev%improve +---+---+---++---+ 30.6020 1.3018 30.8287 1.1517-0.74080 64.0825 2.3764 63.4721 5.0191 0.95252 95.8638 8.7030 94.5988 8.3832 1.31958 +---+---+---++---+ Note: on mx3850x5 machine with 32 cores HT disabled I got around ebizzy 209% kernbench 6% improvement for 1x scenario. Thanks Srikar for his active partipation in discussing ideas and reviewing the patch. Please let me know your suggestions and comments. --- include/linux/sched.h |1 + kernel/sched/core.c |6 ++ virt/kvm/kvm_main.c |7 +++ 3 files changed, 14 insertions(+), 0 deletions(-) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH RFC 1/2] kvm: Handle undercommitted guest case in PLE handler
From: Raghavendra K T raghavendra...@linux.vnet.ibm.com When total number of VCPUs of system is less than or equal to physical CPUs, PLE exits become costly since each VCPU can have dedicated PCPU, and trying to find a target VCPU to yield_to just burns time in PLE handler. This patch reduces overhead, by simply doing a return in such scenarios by checking the length of current cpu runqueue. Reviewed-by: Srikar Dronamraju sri...@linux.vnet.ibm.com Signed-off-by: Raghavendra K T raghavendra...@linux.vnet.ibm.com --- include/linux/sched.h |1 + kernel/sched/core.c |6 ++ virt/kvm/kvm_main.c |3 +++ 3 files changed, 10 insertions(+), 0 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index b8c8664..3645458 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -138,6 +138,7 @@ extern int nr_threads; DECLARE_PER_CPU(unsigned long, process_counts); extern int nr_processes(void); extern unsigned long nr_running(void); +extern unsigned long rq_nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_iowait(void); extern unsigned long nr_iowait_cpu(int cpu); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fbf1fd0..2170b81 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4820,6 +4820,12 @@ void __sched yield(void) } EXPORT_SYMBOL(yield); +unsigned long rq_nr_running(void) +{ + return this_rq()-nr_running; +} +EXPORT_SYMBOL(rq_nr_running); + /** * yield_to - yield the current processor to another thread in * your thread group, or accelerate that thread toward the diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 28f00bc..8323685 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1629,6 +1629,9 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) int pass; int i; + if (unlikely(rq_nr_running() == 1)) + return; + kvm_vcpu_set_in_spin_loop(me, true); /* * We boost the priority of a VCPU that is runnable but not -- To unsubscribe from this list: send the line unsubscribe kvm 
in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH RFC 2/2] kvm: Be courteous to other VMs in overcommitted scenario in PLE handler
From: Raghavendra K T raghavendra...@linux.vnet.ibm.com When PLE handler fails to find a better candidate to yield_to, it goes back and does spin again. This is acceptable when we do not have overcommit. But in overcommitted scenarios (especially when we have large number of small guests), it is better to yield. Reviewed-by: Srikar Dronamraju sri...@linux.vnet.ibm.com Signed-off-by: Raghavendra K T raghavendra...@linux.vnet.ibm.com --- virt/kvm/kvm_main.c |4 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8323685..713b677 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1660,6 +1660,10 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) } } } + /* In overcommitted cases, yield instead of spinning */ + if (!yielded rq_nr_running() 1) + schedule(); + kvm_vcpu_set_in_spin_loop(me, false); /* Ensure vcpu is not eligible during next spinloop */ -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [KVM] Guest Debugging Facilities in KVM
On 2012-09-20 19:17, Dean Pucsek wrote: On 2012-09-19, at 7:45 AM, Jan Kiszka jan.kis...@siemens.com wrote: On 2012-09-19 16:38, Avi Kivity wrote: On 09/17/2012 10:36 PM, Dean Pucsek wrote: Hello, For my Masters thesis I am investigating the usage of Intel VT-x and branch tracing in the domain of malware analysis. Essentially what I'm aiming to do is trace the execution of a guest VM and then pass that trace on to some other tools. I've been playing KVM for a couple weeks now but from comments such as (in arch/x86/kvm/vmx.c): /* * Forward all other exceptions that are valid in real mode. * FIXME: Breaks guest debugging in real mode, needs to be fixed with *the required debugging infrastructure rework. */ And (from an email sent to the list in July 2008): Note that guest debugging in real mode is broken now. This has to be fixed by the scheduled debugging infrastructure rework (will be done once base patches for QEMU have been accepted). it is unclear to me how much support there is for guest debugging in KVM currently (I wasn't able to find any recent documentation on it) and what the debugging infrastructure referred to by these comments is. I am interested in becoming involved with the KVM project in this respect however some guidance and direction on the guest debugging facilities would be greatly appreciated. Guest debugging works (but not in real mode due to the issue above). That doesn't apply to CPUs with Unrestricted Guest support, right? At least I didn't notice any limitations recently. [I did notice some other corner-case issue with guest debugging, still need to dig into that...] You can set hardware and software breakpoints and kvm will forward them to userspace, and from there to the debugger. I'll be happy to help, as I'm sure Jan (as the author of most of the guest debugging code) will as well. Is there a roadmap or plan for how the KVM project envisions the debugging facilities evolving? 
KVM and QEMU are in a pretty good shape now for kernel debugging on x86 - given current boundary conditions. Still we need to do something because gdb for x86 is not well prepared for system-level debugging. And those changes will requires some extensions of QEMU in turn. There are some ideas and early code to add gdb tracepoint support to QEMU and, possibly, to KVM (as acceleration). Jan -- Siemens AG, Corporate Technology, CT RTC ITP SDP-DE Corporate Competence Center Embedded Linux -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 02/17] target-i386: Add missing kvm bits.
On Fri, Sep 21, 2012 at 10:39:52AM +0200, Igor Mammedov wrote: On Thu, 20 Sep 2012 16:03:17 -0400 Don Slutz d...@cloudswitch.com wrote: Fix duplicate name (kvmclock = kvm_clock2) also. Signed-off-by: Don Slutz d...@cloudswitch.com --- target-i386/cpu.c | 12 1 files changed, 8 insertions(+), 4 deletions(-) diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 0313cf5..5f9866a 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -87,10 +87,14 @@ static const char *ext3_feature_name[] = { }; static const char *kvm_feature_name[] = { -kvmclock, kvm_nopiodelay, kvm_mmu, kvmclock, kvm_asyncpf, NULL, kvm_pv_eoi, NULL, -NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, -NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, -NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +kvmclock, kvm_nopiodelay, kvm_mmu, kvm_clock2, before patch if kvmclock is specified it would set 0 and 3 bits, after patch only bit 0 is set. Is it correct/expected behavior? if yes, please add rationale into patch description. The problem here seems to be: - It would be interesting to make kvmclock=true enough to enable the optimal behavior, instead of requiring users to use kvm_clock2=true explicitly - We need to allow older machine-types to be backwards compatible (not enabling the second bit by default), so we need a separate property to control the second bit. I think this is best modelled this way: - Having two separate properties: kvmclock and kvmclock2 (or kvm_clock2) - Older machine-types would have kvmclock2 default to false. Newer machine-types would kvmclock2 default to true. 
- kvmclock=false would disable both bits Then: - kvmclock=false would not set any bit (it would be surprising to have kvmclock=false but still have kvmclock enabled) - kvmclock=true would keep compatible behavior on older machine-types, (only the first bit set), but would get optimal behavior on newer machine-types (both bits set) - kvmclock=true,kvmclock2=true would set both bits - kvmclock=true,kvmclock2=false would set only the first bit It wouldn't be a direct mapping between properties and CPUID bits, but that's exactly the point. In this case, exposing individual CPUID bits directly is a too low-level interface. +kvm_asyncpf, kvm_steal_time, kvm_pv_eoi, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +kvm_clock_stable, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, }; static const char *svm_feature_name[] = { -- 1.7.1 -- Regards, Igor -- Eduardo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC 1/2] kvm: Handle undercommitted guest case in PLE handler
On 09/21/2012 08:00 AM, Raghavendra K T wrote: From: Raghavendra K T raghavendra...@linux.vnet.ibm.com When total number of VCPUs of system is less than or equal to physical CPUs, PLE exits become costly since each VCPU can have dedicated PCPU, and trying to find a target VCPU to yield_to just burns time in PLE handler. This patch reduces overhead, by simply doing a return in such scenarios by checking the length of current cpu runqueue. I am not convinced this is the way to go. The VCPU that is holding the lock, and is not releasing it, probably got scheduled out. That implies that VCPU is on a runqueue with at least one other task. --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1629,6 +1629,9 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) int pass; int i; + if (unlikely(rq_nr_running() == 1)) + return; + kvm_vcpu_set_in_spin_loop(me, true); /* * We boost the priority of a VCPU that is runnable but not -- All rights reversed -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 02/17] target-i386: Add missing kvm bits.
On 09/21/12 08:36, Eduardo Habkost wrote: On Fri, Sep 21, 2012 at 10:39:52AM +0200, Igor Mammedov wrote: On Thu, 20 Sep 2012 16:03:17 -0400 Don Slutz d...@cloudswitch.com wrote: Fix duplicate name (kvmclock = kvm_clock2) also. Signed-off-by: Don Slutz d...@cloudswitch.com --- target-i386/cpu.c | 12 1 files changed, 8 insertions(+), 4 deletions(-) diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 0313cf5..5f9866a 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -87,10 +87,14 @@ static const char *ext3_feature_name[] = { }; static const char *kvm_feature_name[] = { -kvmclock, kvm_nopiodelay, kvm_mmu, kvmclock, kvm_asyncpf, NULL, kvm_pv_eoi, NULL, -NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, -NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, -NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +kvmclock, kvm_nopiodelay, kvm_mmu, kvm_clock2, before patch if kvmclock is specified it would set 0 and 3 bits, after patch only bit 0 is set. Is it correct/expected behavior? if yes, please add rationale into patch description. This is not what I had intended. The problem here seems to be: - It would be interesting to make kvmclock=true enough to enable the optimal behavior, instead of requiring users to use kvm_clock2=true explicitly - We need to allow older machine-types to be backwards compatible (not enabling the second bit by default), so we need a separate property to control the second bit. I think this is best modelled this way: - Having two separate properties: kvmclock and kvmclock2 (or kvm_clock2) - Older machine-types would have kvmclock2 default to false. Newer machine-types would kvmclock2 default to true. 
- kvmclock=false would disable both bits Then: - kvmclock=false would not set any bit (it would be surprising to have kvmclock=false but still have kvmclock enabled) - kvmclock=true would keep compatible behavior on older machine-types, (only the first bit set), but would get optimal behavior on newer machine-types (both bits set) - kvmclock=true,kvmclock2=true would set both bits - kvmclock=true,kvmclock2=false would set only the first bit It wouldn't be a direct mapping between properties and CPUID bits, but that's exactly the point. In this case, exposing individual CPUID bits directly is a too low-level interface. This does look much better. For the sake of simple changes, this patch will be changed so that -kvmclock (kvmclock=false) will continue to clear both bits. I will look into the right way to fit this into the newer cpu model. +kvm_asyncpf, kvm_steal_time, kvm_pv_eoi, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +kvm_clock_stable, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, }; static const char *svm_feature_name[] = { -- 1.7.1 -- Regards, Igor -Don Slutz -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC 0/2] kvm: Improving undercommit, overcommit scenarios in PLE handler
On 9/21/2012 4:59 AM, Raghavendra K T wrote: In some special scenarios like #vcpu = #pcpu, PLE handler may prove very costly, Yes. because there is no need to iterate over vcpus and do unsuccessful yield_to burning CPU. An idea to solve this is: 1) As Avi had proposed we can modify hardware ple_window dynamically to avoid frequent PL-exit. Yes. We had to do this to get around some scaling issues for large (20way) guests (with no overcommitment) As part of some experimentation we even tried switching off PLE too :( (IMHO, it is difficult to decide when we have mixed type of VMs). Agree. Not sure if the following alternatives have also been looked at : - Could the behavior associated with the ple_window be modified to be a function of some [new] per-guest attribute (which can be conveyed to the host as part of the guest launch sequence). The user can choose to set this [new] attribute for a given guest. This would help avoid the frequent exits due to PLE (as Avi had mentioned earlier) ? - Can the PLE feature ( in VT) be enhanced to be made a per guest attribute ? IMHO, the approach of not taking a frequent exit is better than taking an exit and returning back from the handler etc. Thanks Vinod Another idea, proposed in the first patch, is to identify non-overcommit case and just return from the PLE handler. There are are many ways to identify non-overcommit scenario. 1) Using loadavg etc (get_avenrun/calc_global_load /this_cpu_load) 2) Explicitly check nr_running()/num_online_cpus() 3) Check source vcpu runqueue length. Not sure how can we make use of (1) effectively/how to use it. (2) has significant overhead since it iterates all cpus. so this patch uses third method. (I feel it is uglier to export runqueue length, but expecting suggestion on this). In second patch, when we have large number of small guests, it is possible that a spinning vcpu fails to yield_to any vcpu of same VM and go back and spin. This is also not effective when we are over-committed. 
Instead, we do a schedule() so that we give chance to other VMs to run. Raghavendra K T(2): Handle undercommitted guest case in PLE handler Be courteous to other VMs in overcommitted scenario in PLE handler Results: base = 3.6.0-rc5 + ple handler optimization patches from kvm tree. patched = base + patch1 + patch2 machine: x240 with 16 core with HT enabled (32 cpu thread). 32 vcpu guest with 8GB RAM. +---+---+---++---+ ebizzy (record/sec higher is better) +---+---+---++---+ basestddev patchedstdev%improve +---+---+---++---+ 11293.3750 624.4378 18209.6250 371.706161.24166 3641.8750 468.9400 3725.5000 253.7823 2.29621 +---+---+---++---+ +---+---+---++---+ kernbench (time in sec lower is better) +---+---+---++---+ basestddev patchedstdev%improve +---+---+---++---+ 30.6020 1.3018 30.8287 1.1517-0.74080 64.0825 2.3764 63.4721 5.0191 0.95252 95.8638 8.7030 94.5988 8.3832 1.31958 +---+---+---++---+ Note: on mx3850x5 machine with 32 cores HT disabled I got around ebizzy 209% kernbench 6% improvement for 1x scenario. Thanks Srikar for his active partipation in discussing ideas and reviewing the patch. Please let me know your suggestions and comments. --- include/linux/sched.h |1 + kernel/sched/core.c |6 ++ virt/kvm/kvm_main.c |7 +++ 3 files changed, 14 insertions(+), 0 deletions(-) . -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC 2/2] kvm: Be courteous to other VMs in overcommitted scenario in PLE handler
On 09/21/2012 08:00 AM, Raghavendra K T wrote: From: Raghavendra K T raghavendra...@linux.vnet.ibm.com When PLE handler fails to find a better candidate to yield_to, it goes back and does spin again. This is acceptable when we do not have overcommit. But in overcommitted scenarios (especially when we have large number of small guests), it is better to yield. Reviewed-by: Srikar Dronamraju sri...@linux.vnet.ibm.com Signed-off-by: Raghavendra K T raghavendra...@linux.vnet.ibm.com Acked-by: Rik van Riel r...@redhat.com -- All rights reversed -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC 2/2] kvm: Be courteous to other VMs in overcommitted scenario in PLE handler
On Fri, 21 Sep 2012 17:30:20 +0530 Raghavendra K T raghavendra...@linux.vnet.ibm.com wrote: From: Raghavendra K T raghavendra...@linux.vnet.ibm.com When PLE handler fails to find a better candidate to yield_to, it goes back and does spin again. This is acceptable when we do not have overcommit. But in overcommitted scenarios (especially when we have large number of small guests), it is better to yield. Reviewed-by: Srikar Dronamraju sri...@linux.vnet.ibm.com Signed-off-by: Raghavendra K T raghavendra...@linux.vnet.ibm.com --- virt/kvm/kvm_main.c |4 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8323685..713b677 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1660,6 +1660,10 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) } } } + /* In overcommitted cases, yield instead of spinning */ + if (!yielded rq_nr_running() 1) + schedule(); How about doing cond_resched() instead? I'm not sure whether checking more sched stuff in KVM code is a good thing. Takuya -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/9] Reduce compaction scanning and lock contention
On 09/21/2012 06:46 AM, Mel Gorman wrote: Hi Andrew, Richard Davies and Shaohua Li have both reported lock contention problems in compaction on the zone and LRU locks as well as significant amounts of time being spent in compaction. This series aims to reduce lock contention and scanning rates to reduce that CPU usage. Richard reported at https://lkml.org/lkml/2012/9/21/91 that this series made a big different to a problem he reported in August (http://marc.info/?l=kvmm=134511507015614w=2). One way or the other, this series has a large impact on the amount of scanning compaction does when there is a storm of THP allocations. Andrew, Mel and I have discussed the stuff in this series quite a bit, and I am convinced this is the way forward with compaction. -- All rights reversed -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC 2/2] kvm: Be courteous to other VMs in overcommitted scenario in PLE handler
On 09/21/2012 09:46 AM, Takuya Yoshikawa wrote: On Fri, 21 Sep 2012 17:30:20 +0530 Raghavendra K T raghavendra...@linux.vnet.ibm.com wrote: From: Raghavendra K T raghavendra...@linux.vnet.ibm.com When PLE handler fails to find a better candidate to yield_to, it goes back and does spin again. This is acceptable when we do not have overcommit. But in overcommitted scenarios (especially when we have large number of small guests), it is better to yield. Reviewed-by: Srikar Dronamraju sri...@linux.vnet.ibm.com Signed-off-by: Raghavendra K T raghavendra...@linux.vnet.ibm.com --- virt/kvm/kvm_main.c |4 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8323685..713b677 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1660,6 +1660,10 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) } } } + /* In overcommitted cases, yield instead of spinning */ + if (!yielded rq_nr_running() 1) + schedule(); How about doing cond_resched() instead? Actually, an actual call to yield() may be better. That will set scheduler hints to make the scheduler pick another task for one round, while preserving this task's top position in the runqueue. -- All rights reversed -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 1/1] target-i386: Fix default Hypervisor level for hypervisor-vendor=kvm.
On Thu, Sep 20, 2012 at 04:06:27PM -0400, Don Slutz wrote: From http://lkml.indiana.edu/hypermail/linux/kernel/1205.0/00100.html EAX should be KVM_CPUID_FEATURES (0x4001) not 0. Added hypervisor-vendor=kvm0 to get the older CPUID result. kvm1 selects the newer one. Why not just make hypervisor-vendor=kvm control only the hypervisor vendor string, and support something like kvm-hypervisor-level=0 to restore the old cpuid_hv_level=0 behavior? This is similar to the kvmclock case: it would allow us to make hypervisor-vendor=kvm use saner values as default, but letting old machine-types to override it for compatibility if required. Signed-off-by: Don Slutz d...@cloudswitch.com --- target-i386/cpu.c | 17 + 1 files changed, 13 insertions(+), 4 deletions(-) diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 72a8442..e51b2b1 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -1250,9 +1250,12 @@ static char *x86_cpuid_get_hv_vendor(Object *obj, Error **errp) } else if (!strcmp(value, CPUID_HV_VENDOR_XEN) env-cpuid_hv_level == CPUID_HV_LEVEL_XEN) { pstrcpy(value, sizeof(value), xen); -} else if (!strcmp(value, CPUID_HV_VENDOR_KVM) - env-cpuid_hv_level == 0) { -pstrcpy(value, sizeof(value), kvm); +} else if (!strcmp(value, CPUID_HV_VENDOR_KVM)) { +if (env-cpuid_hv_level == CPUID_HV_LEVEL_KVM) { +pstrcpy(value, sizeof(value), kvm1); +} else if (env-cpuid_hv_level == 0) { +pstrcpy(value, sizeof(value), kvm0); +} } return value; } @@ -1288,7 +1291,13 @@ static void x86_cpuid_set_hv_vendor(Object *obj, const char *value, env-cpuid_hv_level = CPUID_HV_LEVEL_XEN; } pstrcpy(adj_value, sizeof(adj_value), CPUID_HV_VENDOR_XEN); -} else if (!strcmp(value, kvm)) { +} else if (!strcmp(value, kvm) || !strcmp(value, kvm1)) { +if (env-cpuid_hv_level == 0) { +env-cpuid_hv_level = CPUID_HV_LEVEL_KVM; +} +pstrcpy(adj_value, sizeof(adj_value), CPUID_HV_VENDOR_KVM); +} else if (!strcmp(value, kvm0)) { +env-cpuid_hv_level = 0; pstrcpy(adj_value, sizeof(adj_value), 
CPUID_HV_VENDOR_KVM); } else { pstrcpy(adj_value, sizeof(adj_value), value); -- 1.7.1 -- Eduardo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: pci-assign terminates the guest upon pread() / pwrite() error?
On 09/20/2012 05:13 PM, Alex Williamson wrote: On Thu, 2012-09-20 at 16:36 -0400, Etienne Martineau wrote: On 09/20/2012 03:37 PM, Alex Williamson wrote: On Thu, 2012-09-20 at 15:08 -0400, Etienne Martineau wrote: On 09/20/2012 02:16 PM, Alex Williamson wrote: On Thu, 2012-09-20 at 13:27 -0400, Etienne Martineau wrote: In hw/kvm/pci-assign.c a pread() error part of assigned_dev_pci_read() result in a hw_error(). Similarly a pwrite() error part of assigned_dev_pci_write() also result in a hw_error(). Would there be a way to avoid terminating the guest for those cases? How about we deassign the device upon error? By terminating the guest we contain the error vs allowing the guest to continue running with invalid data. De-assigning the device is asynchronous and relies on guest involvement, so damage is potentially already done. Is this a theoretical problem or do you actually have hardware that hits this? Thanks, Alex This problem is in the context of a Hot-pluggable device assigned to the guest. If the guest rd/wr the config space at the same time than the device is physically taken out then the guest will terminate with hw_error(). Because this limits the availability of the guest I think we should try to recover instead. I don't see what other damage can happen since guest's MMIO access to the stale device will go nowhere? So you're looking at implementing surprise device removal? There's not just config space, there's slow bar access and mmap'd spaces to worry about too. What does going nowhere mean? If it means reads return -1 and the guest is trying to read the data portion of a packet from the network or an hba, we've now passed bad data to the guest. Thanks, Alex Thanks for your answer; Yes we are doing 'surprise device removal' for assigned device. Note that the problem also exist with standard 'attention button' device removal. The problem is all about fault isolation. 
Ideally, only the corresponding driver should be affected by this 'surprise device removal'. I think that taking down the guest is too coarse. Think about a 'surprise device removal' on the host. In that case the host is not taken down so why not do the same with the guest? It depends on the host hardware. Some x86 hardware will try to isolate the fault with an NMI other architectures such as ia64 would pull a machine check on a driver access to unresponsive devices. Yes some badness will be latched into the guest but really this not any different that having a mis-behaving device. ... which is a bad thing, but often undetectable. This is detectable. Thanks, Alex Our hardware is throwing a surprise link down PCIe AER and we are acting on it. I agree that for the generalized case NMI can be an issue. Let me ask you that question. What would be the best way to support device removal (surprise or not) for guest assigned device then? How about signaling the guest from vfio_pci_remove()? thanks, Etienne -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: pci-assign terminates the guest upon pread() / pwrite() error?
On Fri, 2012-09-21 at 11:17 -0400, Etienne Martineau wrote: On 09/20/2012 05:13 PM, Alex Williamson wrote: On Thu, 2012-09-20 at 16:36 -0400, Etienne Martineau wrote: On 09/20/2012 03:37 PM, Alex Williamson wrote: On Thu, 2012-09-20 at 15:08 -0400, Etienne Martineau wrote: On 09/20/2012 02:16 PM, Alex Williamson wrote: On Thu, 2012-09-20 at 13:27 -0400, Etienne Martineau wrote: In hw/kvm/pci-assign.c a pread() error part of assigned_dev_pci_read() result in a hw_error(). Similarly a pwrite() error part of assigned_dev_pci_write() also result in a hw_error(). Would there be a way to avoid terminating the guest for those cases? How about we deassign the device upon error? By terminating the guest we contain the error vs allowing the guest to continue running with invalid data. De-assigning the device is asynchronous and relies on guest involvement, so damage is potentially already done. Is this a theoretical problem or do you actually have hardware that hits this? Thanks, Alex This problem is in the context of a Hot-pluggable device assigned to the guest. If the guest rd/wr the config space at the same time than the device is physically taken out then the guest will terminate with hw_error(). Because this limits the availability of the guest I think we should try to recover instead. I don't see what other damage can happen since guest's MMIO access to the stale device will go nowhere? So you're looking at implementing surprise device removal? There's not just config space, there's slow bar access and mmap'd spaces to worry about too. What does going nowhere mean? If it means reads return -1 and the guest is trying to read the data portion of a packet from the network or an hba, we've now passed bad data to the guest. Thanks, Alex Thanks for your answer; Yes we are doing 'surprise device removal' for assigned device. Note that the problem also exist with standard 'attention button' device removal. The problem is all about fault isolation. 
Ideally, only the corresponding driver should be affected by this 'surprise device removal'. I think that taking down the guest is too coarse. Think about a 'surprise device removal' on the host. In that case the host is not taken down so why not do the same with the guest? It depends on the host hardware. Some x86 hardware will try to isolate the fault with an NMI other architectures such as ia64 would pull a machine check on a driver access to unresponsive devices. Yes some badness will be latched into the guest but really this not any different that having a mis-behaving device. ... which is a bad thing, but often undetectable. This is detectable. Thanks, Alex Our hardware is throwing a surprise link down PCIe AER and we are acting on it. I agree that for the generalized case NMI can be an issue. Let me ask you that question. What would be the best way to support device removal (surprise or not) for guest assigned device then? How about signaling the guest from vfio_pci_remove()? Thanks for using vfio! :) The 440fx chipset is really not designed to deal with these kinds of problems. Generally the best answer to how should we expose foo to the guest is to do it exactly like it is on the host. That means sending a surprise link down aer to the guest. That should be possible with q35. We could potentially signal that in vfio_pci_remove, but we probably want to figure out how to relay the aer event to the guest and inject it into the emulated chipset. Thanks, Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Xen-users] Recommendations for Virtulization Hardware
(CC'ing Casey on this, as I recommend his setup for an Intel-based solution) Hello ShadesOfGrey, Hehehe, talk about timing ;) On Sep 20, 2012, at 5:15 PM, ShadesOfGrey shades_of_g...@earthlink.net wrote: I'm looking to build a new personal computer. I want it to function as a Linux desktop, provide network services for my home, and lastly, occasional Windows gaming. From what I've gathered, virtualization using a Type 1 Hypervisor supporting PCI/VGA pass-through like KVM or Xen would be an attractive solution for my needs. For reference, reading these threads on Ars Technica may be helpful to understand where I'm coming from, http://arstechnica.com/civis/viewtopic.php?f=6t=1175674 and http://arstechnica.com/civis/viewtopic.php?f=11t=1181867. But basically, I use Linux as my primary OS and would rather avoid dual booting or building two boxes just to play Windows games when I want to play Windows games. I'm also intrigued by the concept of virtualization and would like to experiment with it as a solution for my case. My problem is isolating which hardware to choose, specifically which combination of CPU, motherboard and video card. Previously I had been relying on web searches to glean information from gaming and enthusiast web sites and tech specs from motherboard manufacturers. After what I learned during my participation in the referenced threads at Ars Technica, I find myself back at square one. Instead of trying to guess what hardware support KVM Xen, and vice versa. I'd like to know what hardware KVM Xen users are actually using to run KVM Xen? Particularly with consideration for 3D gaming and current generation hardware, BTW. If there is need for further clarification, I'll answer any queries you might have. ___ Xen-users mailing list xen-us...@lists.xen.org http://lists.xen.org/xen-users As one of the folks on the list who has done this---probably to the most extreme degree---I can tell you it's good stuff. 
It brings the joys of datacenter consolidation to your gaming desktop, and also to your wallet ;) While my setup is now slightly dated, the 990FX chipset is still at the top of the AMD offering, so you can shop around on CPUs, and buy a cheaper secondary USB controller if you're not looking to cram in a 4-to-1 ratio. I've never had much success passing through the onboard USB from an AMD system, so I highly recommend picking up a little PCIe x1 controller at the least. That said, I'm convinced that highpoint has one of the coincidentally-best products on the market for people looking to do this, but I digress! Take a look at, specifically, this post I made to the list some months back, and I'll follow with some errata: http://lists.xen.org/archives/html/xen-users/2012-05/msg00328.html First, I've tested all of the hardware in the build that I recommended, and indeed ended up building a four-headed unit. It works like magic. Came in handy a few weeks ago when several of my friends and I piled into a couple cars for a vacation where we wanted to play games (yup, we're total nerds), but we couldn't fit four desktop cases in addition to our stuff in the cars. :) Second, by the time I got around to building it, the Antec One Hundred wasn't available. Finding a case that supports 8 expansion slots is a tough thing, but I found another similarly priced one, and it was a dream to build. I recommend it highly if you think you may want to max out your slots and/or go deeper down the rabbit hole with consolidated desktops: http://www.newegg.com/Product/Product.aspx?Item=N82E1682238 Aside from being a very solid case for the price point (good features for screwless installation as well), to give you an idea of the size, it is laid out in such a way that I could fit dual-GPU cards in it (Radeon 5970s). I ultimately had to remove the HDD mounts to pull it off, but you shouldn't have that problem... 
Mostly because AMDs dual GPU cards won't work for this, so don't buy one for this build. It's a problem with the PCIe switching hardware (well, the firmware thereof, probably) that they use. I'll save you the rambling, but let's just say that it should work, but doesn't :( Also, the case does look good! ;) Finally, and this is unfortunate, for the AMD build, *I* recommend you use ESXi. While Xen _does_ work with the hardware that I've listed, I've never been able to get the VMs to work properly with the GPLPV drivers, and these are crucial to performance. I really, really want to bring this project back up on Xen though, and will try again now that 4.2 has gone RTM. If you aren't buying anytime soon and would like to hit me up in a few weeks, by all means drop me a line, and I'll let you know if I've gotten around to it. So, for the Intel route! Casey DeLorme has, just this week, posted a fantastic set of detailed videos and documentation on his setup, where he
[PATCH] vfio: Fix virqfd release race
vfio-pci supports a mechanism like KVM's irqfd for unmasking an interrupt through an eventfd. There are two ways to shutdown this interface: 1) close the eventfd, 2) ioctl (such as disabling the interrupt). Both of these do the release through a workqueue, which can result in a segfault if two jobs get queued for the same virqfd. Fix this by protecting the pointer to these virqfds by a spinlock. The vfio pci device will therefore no longer have a reference to it once the release job is queued under lock. On the ioctl side, we still flush the workqueue to ensure that any outstanding releases are completed. Signed-off-by: Alex Williamson alex.william...@redhat.com --- drivers/vfio/pci/vfio_pci_intrs.c | 76 +++-- 1 file changed, 56 insertions(+), 20 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 211a492..d8dedc7 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -76,9 +76,24 @@ static int virqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) schedule_work(&virqfd->inject); } - if (flags & POLLHUP) - /* The eventfd is closing, detach from VFIO */ - virqfd_deactivate(virqfd); + if (flags & POLLHUP) { + unsigned long flags; + spin_lock_irqsave(&virqfd->vdev->irqlock, flags); + + /* +* The eventfd is closing, if the virqfd has not yet been +* queued for release, as determined by testing whether the +* vdev pointer to it is still valid, queue it now. As +* with kvm irqfds, we know we won't race against the virqfd +* going away because we hold wqh->lock to get here. 
+*/ + if (*(virqfd-pvirqfd) == virqfd) { + *(virqfd-pvirqfd) = NULL; + virqfd_deactivate(virqfd); + } + + spin_unlock_irqrestore(virqfd-vdev-irqlock, flags); + } return 0; } @@ -93,7 +108,6 @@ static void virqfd_ptable_queue_proc(struct file *file, static void virqfd_shutdown(struct work_struct *work) { struct virqfd *virqfd = container_of(work, struct virqfd, shutdown); - struct virqfd **pvirqfd = virqfd-pvirqfd; u64 cnt; eventfd_ctx_remove_wait_queue(virqfd-eventfd, virqfd-wait, cnt); @@ -101,7 +115,6 @@ static void virqfd_shutdown(struct work_struct *work) eventfd_ctx_put(virqfd-eventfd); kfree(virqfd); - *pvirqfd = NULL; } static void virqfd_inject(struct work_struct *work) @@ -122,15 +135,11 @@ static int virqfd_enable(struct vfio_pci_device *vdev, int ret = 0; unsigned int events; - if (*pvirqfd) - return -EBUSY; - virqfd = kzalloc(sizeof(*virqfd), GFP_KERNEL); if (!virqfd) return -ENOMEM; virqfd-pvirqfd = pvirqfd; - *pvirqfd = virqfd; virqfd-vdev = vdev; virqfd-handler = handler; virqfd-thread = thread; @@ -154,6 +163,23 @@ static int virqfd_enable(struct vfio_pci_device *vdev, virqfd-eventfd = ctx; /* +* virqfds can be released by closing the eventfd or directly +* through ioctl. These are both done through a workqueue, so +* we update the pointer to the virqfd under lock to avoid +* pushing multiple jobs to release the same virqfd. +*/ + spin_lock_irq(vdev-irqlock); + + if (*pvirqfd) { + spin_unlock_irq(vdev-irqlock); + ret = -EBUSY; + goto fail; + } + *pvirqfd = virqfd; + + spin_unlock_irq(vdev-irqlock); + + /* * Install our own custom wake-up handling so we are notified via * a callback whenever someone signals the underlying eventfd. 
*/ @@ -187,19 +213,29 @@ fail: fput(file); kfree(virqfd); - *pvirqfd = NULL; return ret; } -static void virqfd_disable(struct virqfd *virqfd) +static void virqfd_disable(struct vfio_pci_device *vdev, + struct virqfd **pvirqfd) { - if (!virqfd) - return; + unsigned long flags; + + spin_lock_irqsave(vdev-irqlock, flags); + + if (*pvirqfd) { + virqfd_deactivate(*pvirqfd); + *pvirqfd = NULL; + } - virqfd_deactivate(virqfd); + spin_unlock_irqrestore(vdev-irqlock, flags); - /* Block until we know all outstanding shutdown jobs have completed. */ + /* +* Block until we know all outstanding shutdown jobs have completed. +* Even if we don't queue the job, flush the wq to be sure it's +* been released. +*/ flush_workqueue(vfio_irqfd_cleanup_wq); } @@ -392,8 +428,8 @@ static int vfio_intx_set_signal(struct vfio_pci_device *vdev,
Re: pci-assign terminates the guest upon pread() / pwrite() error?
On 09/21/2012 11:49 AM, Alex Williamson wrote: On Fri, 2012-09-21 at 11:17 -0400, Etienne Martineau wrote: On 09/20/2012 05:13 PM, Alex Williamson wrote: On Thu, 2012-09-20 at 16:36 -0400, Etienne Martineau wrote: On 09/20/2012 03:37 PM, Alex Williamson wrote: On Thu, 2012-09-20 at 15:08 -0400, Etienne Martineau wrote: On 09/20/2012 02:16 PM, Alex Williamson wrote: On Thu, 2012-09-20 at 13:27 -0400, Etienne Martineau wrote: In hw/kvm/pci-assign.c a pread() error part of assigned_dev_pci_read() result in a hw_error(). Similarly a pwrite() error part of assigned_dev_pci_write() also result in a hw_error(). Would there be a way to avoid terminating the guest for those cases? How about we deassign the device upon error? By terminating the guest we contain the error vs allowing the guest to continue running with invalid data. De-assigning the device is asynchronous and relies on guest involvement, so damage is potentially already done. Is this a theoretical problem or do you actually have hardware that hits this? Thanks, Alex This problem is in the context of a Hot-pluggable device assigned to the guest. If the guest rd/wr the config space at the same time than the device is physically taken out then the guest will terminate with hw_error(). Because this limits the availability of the guest I think we should try to recover instead. I don't see what other damage can happen since guest's MMIO access to the stale device will go nowhere? So you're looking at implementing surprise device removal? There's not just config space, there's slow bar access and mmap'd spaces to worry about too. What does going nowhere mean? If it means reads return -1 and the guest is trying to read the data portion of a packet from the network or an hba, we've now passed bad data to the guest. Thanks, Alex Thanks for your answer; Yes we are doing 'surprise device removal' for assigned device. Note that the problem also exist with standard 'attention button' device removal. 
The problem is all about fault isolation. Ideally, only the corresponding driver should be affected by this 'surprise device removal'. I think that taking down the guest is too coarse. Think about a 'surprise device removal' on the host. In that case the host is not taken down so why not do the same with the guest? It depends on the host hardware. Some x86 hardware will try to isolate the fault with an NMI other architectures such as ia64 would pull a machine check on a driver access to unresponsive devices. Yes some badness will be latched into the guest but really this not any different that having a mis-behaving device. ... which is a bad thing, but often undetectable. This is detectable. Thanks, Alex Our hardware is throwing a surprise link down PCIe AER and we are acting on it. I agree that for the generalized case NMI can be an issue. Let me ask you that question. What would be the best way to support device removal (surprise or not) for guest assigned device then? How about signaling the guest from vfio_pci_remove()? Thanks for using vfio! :) The 440fx chipset is really not designed to deal with these kinds of problems. Generally the best answer to how should we expose foo to the guest is to do it exactly like it is on the host. That means sending a surprise link down aer to the guest. That should be possible with q35. We are using q35 at this time for those reasons but the original qemu problem still exist. By the time the SPLD aer reached the guest, the device is physically gone on the host. Any transient guest MMIO/PCIcfg access to the stale assigned device can be fatal ( hw_error() ). We could potentially signal that in vfio_pci_remove, but we probably want to figure out how to relay the aer event to the guest and inject it into the emulated chipset. We tried that but there was some problems such as mangling the tlp to match the guest pci topology or the propagation latency caused by the chipset emulation layer during AER delivery. 
Right now we are using a straight lookup in the guest and fire the AER directly into the driver callback pci_error. We are doing that to minimize the exposition to the stale assigned device. thanks, Etienne -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC 1/2] kvm: Handle undercommitted guest case in PLE handler
On 09/21/2012 06:32 PM, Rik van Riel wrote: On 09/21/2012 08:00 AM, Raghavendra K T wrote: From: Raghavendra K T raghavendra...@linux.vnet.ibm.com When total number of VCPUs of system is less than or equal to physical CPUs, PLE exits become costly since each VCPU can have dedicated PCPU, and trying to find a target VCPU to yield_to just burns time in PLE handler. This patch reduces overhead, by simply doing a return in such scenarios by checking the length of current cpu runqueue. I am not convinced this is the way to go. The VCPU that is holding the lock, and is not releasing it, probably got scheduled out. That implies that VCPU is on a runqueue with at least one other task. I see your point here, we have two cases: case 1) rq1 : vcpu1-wait(lockA) (spinning) rq2 : vcpu2-holding(lockA) (running) Here Ideally vcpu1 should not enter PLE handler, since it would surely get the lock within ple_window cycle. (assuming ple_window is tuned for that workload perfectly). May be this explains why we are not seeing benefit with kernbench. On the other side, Since we cannot have a perfect ple_window tuned for all type of workloads, for those workloads, which may need more than 4096 cycles, we gain. thinking is it that we are seeing in benefited cases? case 2) rq1 : vcpu1-wait(lockA) (spinning) rq2 : vcpu3 (running) , vcpu2-holding(lockA) [scheduled out] I agree that checking rq1 length is not proper in this case, and as you rightly pointed out, we are in trouble here. nr_running()/num_online_cpus() would give more accurate picture here, but it seemed costly. May be load balancer save us a bit here in not running to such sort of cases. ( I agree load balancer is far too complex). -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/9] Revert mm: compaction: check lock contention first before taking lock
On Fri, Sep 21, 2012 at 11:46:15AM +0100, Mel Gorman wrote: This reverts mm-compaction-check-lock-contention-first-before-taking-lock.patch as it is replaced by a later patch in the series. Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rafael Aquini aqu...@redhat.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/9] Revert mm-compaction-abort-compaction-loop-if-lock-is-contended-or-run-too-long-fix
On Fri, Sep 21, 2012 at 11:46:16AM +0100, Mel Gorman wrote: This reverts mm-compaction-abort-compaction-loop-if-lock-is-contended-or-run-too-long-fix as it is replaced by a later patch in the series. Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rafael Aquini aqu...@redhat.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/9] Revert mm: compaction: abort compaction loop if lock is contended or run too long
On Fri, Sep 21, 2012 at 11:46:17AM +0100, Mel Gorman wrote: This reverts mm-compaction-abort-compaction-loop-if-lock-is-contended-or-run-too-long.patch as it is replaced by a later patch in the series. Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rafael Aquini aqu...@redhat.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC 2/2] kvm: Be courteous to other VMs in overcommitted scenario in PLE handler
On 09/21/2012 07:22 PM, Rik van Riel wrote: On 09/21/2012 09:46 AM, Takuya Yoshikawa wrote: On Fri, 21 Sep 2012 17:30:20 +0530 Raghavendra K T raghavendra...@linux.vnet.ibm.com wrote: From: Raghavendra K T raghavendra...@linux.vnet.ibm.com When PLE handler fails to find a better candidate to yield_to, it goes back and does spin again. This is acceptable when we do not have overcommit. But in overcommitted scenarios (especially when we have large number of small guests), it is better to yield. Reviewed-by: Srikar Dronamraju sri...@linux.vnet.ibm.com Signed-off-by: Raghavendra K T raghavendra...@linux.vnet.ibm.com --- virt/kvm/kvm_main.c | 4 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8323685..713b677 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1660,6 +1660,10 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) } } } + /* In overcommitted cases, yield instead of spinning */ + if (!yielded rq_nr_running() 1) + schedule(); How about doing cond_resched() instead? Actually, an actual call to yield() may be better. That will set scheduler hints to make the scheduler pick another task for one round, while preserving this task's top position in the runqueue. I am not a scheduler expert, but I am also inclined towards Rik's suggestion here since we set skip buddy here. Takuya? -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 4/9] mm: compaction: Abort compaction loop if lock is contended or run too long
On Fri, Sep 21, 2012 at 11:46:18AM +0100, Mel Gorman wrote: From: Shaohua Li s...@fusionio.com Changelog since V2 o Fix BUG_ON triggered due to pages left on cc.migratepages o Make compact_zone_order() require non-NULL arg `contended' Changelog since V1 o only abort the compaction if lock is contended or run too long o Rearranged the code by Andrea Arcangeli. isolate_migratepages_range() might isolate no pages if for example when zone-lru_lock is contended and running asynchronous compaction. In this case, we should abort compaction, otherwise, compact_zone will run a useless loop and make zone-lru_lock is even contended. [minc...@kernel.org: Putback pages isolated for migration if aborting] [a...@linux-foundation.org: compact_zone_order requires non-NULL arg contended] Signed-off-by: Andrea Arcangeli aarca...@redhat.com Signed-off-by: Shaohua Li s...@fusionio.com Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rik van Riel r...@redhat.com --- Acked-by: Rafael Aquini aqu...@redhat.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/9] mm: compaction: Acquire the zone-lru_lock as late as possible
On Fri, Sep 21, 2012 at 11:46:19AM +0100, Mel Gorman wrote: Compactions migrate scanner acquires the zone-lru_lock when scanning a range of pages looking for LRU pages to acquire. It does this even if there are no LRU pages in the range. If multiple processes are compacting then this can cause severe locking contention. To make matters worse commit b2eef8c0 (mm: compaction: minimise the time IRQs are disabled while isolating pages for migration) releases the lru_lock every SWAP_CLUSTER_MAX pages that are scanned. This patch makes two changes to how the migrate scanner acquires the LRU lock. First, it only releases the LRU lock every SWAP_CLUSTER_MAX pages if the lock is contended. This reduces the number of times it unnecessarily disables and re-enables IRQs. The second is that it defers acquiring the LRU lock for as long as possible. If there are no LRU pages or the only LRU pages are transhuge then the LRU lock will not be acquired at all which reduces contention on zone-lru_lock. Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rik van Riel r...@redhat.com --- Acked-by: Rafael Aquini aqu...@redhat.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 6/9] mm: compaction: Acquire the zone-lock as late as possible
On Fri, Sep 21, 2012 at 11:46:20AM +0100, Mel Gorman wrote: Compactions free scanner acquires the zone-lock when checking for PageBuddy pages and isolating them. It does this even if there are no PageBuddy pages in the range. This patch defers acquiring the zone lock for as long as possible. In the event there are no free pages in the pageblock then the lock will not be acquired at all which reduces contention on zone-lock. Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rik van Riel r...@redhat.com --- Acked-by: Rafael Aquini aqu...@redhat.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 7/9] Revert mm: have order 0 compaction start off where it left
On Fri, Sep 21, 2012 at 11:46:21AM +0100, Mel Gorman wrote: This reverts commit 7db8889a (mm: have order 0 compaction start off where it left) and commit de74f1cc (mm: have order 0 compaction start near a pageblock with free pages). These patches were a good idea and tests confirmed that they massively reduced the amount of scanning but the implementation is complex and tricky to understand. A later patch will cache what pageblocks should be skipped and reimplements the concept of compact_cached_free_pfn on top for both migration and free scanners. Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rik van Riel r...@redhat.com --- Acked-by: Rafael Aquini aqu...@redhat.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 8/9] mm: compaction: Cache if a pageblock was scanned and no pages were isolated
On Fri, Sep 21, 2012 at 11:46:22AM +0100, Mel Gorman wrote: When compaction was implemented it was known that scanning could potentially be excessive. The ideal was that a counter be maintained for each pageblock but maintaining this information would incur a severe penalty due to a shared writable cache line. It has reached the point where the scanning costs are a serious problem, particularly on long-lived systems where a large process starts and allocates a large number of THPs at the same time. Instead of using a shared counter, this patch adds another bit to the pageblock flags called PG_migrate_skip. If a pageblock is scanned by either migrate or free scanner and 0 pages were isolated, the pageblock is marked to be skipped in the future. When scanning, this bit is checked before any scanning takes place and the block skipped if set. The main difficulty with a patch like this is when to ignore the cached information? If it's ignored too often, the scanning rates will still be excessive. If the information is too stale then allocations will fail that might have otherwise succeeded. In this patch o CMA always ignores the information o If the migrate and free scanner meet then the cached information will be discarded if it's at least 5 seconds since the last time the cache was discarded o If there are a large number of allocation failures, discard the cache. The time-based heuristic is very clumsy but there are few choices for a better event. Depending solely on multiple allocation failures still allows excessive scanning when THP allocations are failing in quick succession due to memory pressure. Waiting until memory pressure is relieved would cause compaction to continually fail instead of using reclaim/compaction to try allocate the page. The time-based mechanism is clumsy but a better option is not obvious. 
Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rik van Riel r...@redhat.com --- Acked-by: Rafael Aquini aqu...@redhat.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 9/9] mm: compaction: Restart compaction from near where it left off
On Fri, Sep 21, 2012 at 11:46:23AM +0100, Mel Gorman wrote: This is almost entirely based on Rik's previous patches and discussions with him about how this might be implemented. Order 0 compaction stops when enough free pages of the correct page order have been coalesced. When doing subsequent higher order allocations, it is possible for compaction to be invoked many times. However, the compaction code always starts out looking for things to compact at the start of the zone, and for free pages to compact things to at the end of the zone. This can cause quadratic behaviour, with isolate_freepages starting at the end of the zone each time, even though previous invocations of the compaction code already filled up all free memory on that end of the zone. This can cause isolate_freepages to take enormous amounts of CPU with certain workloads on larger memory systems. This patch caches where the migration and free scanner should start from on subsequent compaction invocations using the pageblock-skip information. When compaction starts it begins from the cached restart points and will update the cached restart points until a page is isolated or a pageblock is skipped that would have been scanned by synchronous compaction. Signed-off-by: Mel Gorman mgor...@suse.de Acked-by: Rik van Riel r...@redhat.com --- Acked-by: Rafael Aquini aqu...@redhat.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v11] kvm: Add resampling irqfds for level triggered interrupts
To emulate level triggered interrupts, add a resample option to KVM_IRQFD. When specified, a new resamplefd is provided that notifies the user when the irqchip has been resampled by the VM. This may, for instance, indicate an EOI. Also in this mode, posting of an interrupt through an irqfd only asserts the interrupt. On resampling, the interrupt is automatically de-asserted prior to user notification. This enables level triggered interrupts to be posted and re-enabled from vfio with no userspace intervention. All resampling irqfds can make use of a single irq source ID, so we reserve a new one for this interface. Signed-off-by: Alex Williamson alex.william...@redhat.com --- Taking Michael's idea to create our own lock for resampling irqfds simplifies things significantly. We can now avoid any nesting with the irqfds.lock spinlock so we can call everything we need to directly. Documentation/virtual/kvm/api.txt | 13 +++ arch/x86/kvm/x86.c|4 + include/linux/kvm.h | 12 +++ include/linux/kvm_host.h |5 + virt/kvm/eventfd.c| 150 - virt/kvm/irq_comm.c |6 + 6 files changed, 184 insertions(+), 6 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index bf33aaa..6240366 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1946,6 +1946,19 @@ the guest using the specified gsi pin. The irqfd is removed using the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd and kvm_irqfd.gsi. +With KVM_CAP_IRQFD_RESAMPLE, KVM_IRQFD supports a de-assert and notify +mechanism allowing emulation of level-triggered, irqfd-based +interrupts. When KVM_IRQFD_FLAG_RESAMPLE is set the user must pass an +additional eventfd in the kvm_irqfd.resamplefd field. When operating +in resample mode, posting of an interrupt through kvm_irq.fd asserts +the specified gsi in the irqchip. When the irqchip is resampled, such +as from an EOI, the gsi is de-asserted and the user is notifed via +kvm_irqfd.resamplefd. 
It is the user's responsibility to re-queue +the interrupt if the device making use of it still requires service. +Note that closing the resamplefd is not sufficient to disable the +irqfd. The KVM_IRQFD_FLAG_RESAMPLE is only necessary on assignment +and need not be specified with KVM_IRQFD_FLAG_DEASSIGN. + 4.76 KVM_PPC_ALLOCATE_HTAB Capability: KVM_CAP_PPC_ALLOC_HTAB diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2966c84..56f9002 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2177,6 +2177,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_GET_TSC_KHZ: case KVM_CAP_PCI_2_3: case KVM_CAP_KVMCLOCK_CTRL: + case KVM_CAP_IRQFD_RESAMPLE: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -6268,6 +6269,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, kvm-arch.irq_sources_bitmap); + /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */ + set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, + kvm-arch.irq_sources_bitmap); raw_spin_lock_init(kvm-arch.tsc_write_lock); diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 2ce09aa..a01a3d5 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -618,6 +618,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_PPC_GET_SMMU_INFO 78 #define KVM_CAP_S390_COW 79 #define KVM_CAP_PPC_ALLOC_HTAB 80 +#define KVM_CAP_IRQFD_RESAMPLE 81 #ifdef KVM_CAP_IRQ_ROUTING @@ -683,12 +684,21 @@ struct kvm_xen_hvm_config { #endif #define KVM_IRQFD_FLAG_DEASSIGN (1 0) +/* + * Available with KVM_CAP_IRQFD_RESAMPLE + * + * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies + * the irqfd to operate in resampling mode for level triggered interrupt + * emulation. See Documentation/virtual/kvm/api.txt. 
+ */ +#define KVM_IRQFD_FLAG_RESAMPLE (1 1) struct kvm_irqfd { __u32 fd; __u32 gsi; __u32 flags; - __u8 pad[20]; + __u32 resamplefd; + __u8 pad[16]; }; struct kvm_clock_data { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b70b48b..6966ce2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -70,7 +70,8 @@ #define KVM_REQ_PMU 16 #define KVM_REQ_PMI 17 -#define KVM_USERSPACE_IRQ_SOURCE_ID0 +#define KVM_USERSPACE_IRQ_SOURCE_ID0 +#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 struct kvm; struct kvm_vcpu; @@ -283,6 +284,8 @@ struct kvm { struct { spinlock_tlock; struct list_head items; + struct list_head resampler_list; + struct mutex resampler_lock; } irqfds;
Re: [Qemu-devel] [PATCH v5 0/4] VFIO-based PCI device assignment
Ping. There don't seem to be any objections to this. Thanks, Alex On Fri, 2012-09-14 at 17:04 -0600, Alex Williamson wrote: On Fri, 2012-09-14 at 17:01 -0600, Alex Williamson wrote: Same goodness as v4, plus: - Addressed comments by Blue Swirl (thanks for the review) (hopefully w/o breaking anything wrt slow bar endianness) - Fixed a couple checkpatch warnings that snuck in BTW, this works fine with Jason's Q35 patches though we will need to add INTx routing support for KVM accelerated INTx (and pci-assign). Thanks, Oh, as usual, the following branch is updated: git://github.com/awilliam/qemu-vfio.git vfio-for-qemu and a new tag is added: vfio-pci-for-qemu-v5 Thanks, Alex --- Alex Williamson (4): vfio: Enable vfio-pci and mark supported vfio: vfio-pci device assignment driver Update Linux kernel headers Update kernel header script to include vfio MAINTAINERS |5 configure |6 hw/Makefile.objs|3 hw/vfio_pci.c | 1864 +++ hw/vfio_pci_int.h | 114 ++ linux-headers/linux/vfio.h | 368 scripts/update-linux-headers.sh |2 7 files changed, 2360 insertions(+), 2 deletions(-) create mode 100644 hw/vfio_pci.c create mode 100644 hw/vfio_pci_int.h create mode 100644 linux-headers/linux/vfio.h -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 1/1] target-i386: Fix default Hypervisor level for hypervisor-vendor=kvm.
On 09/21/12 10:18, Eduardo Habkost wrote: On Thu, Sep 20, 2012 at 04:06:27PM -0400, Don Slutz wrote: From http://lkml.indiana.edu/hypermail/linux/kernel/1205.0/00100.html EAX should be KVM_CPUID_FEATURES (0x4001) not 0. Added hypervisor-vendor=kvm0 to get the older CPUID result. kvm1 selects the newer one. Why not just make hypervisor-vendor=kvm control only the hypervisor vendor string, and support something like kvm-hypervisor-level=0 to restore the old cpuid_hv_level=0 behavior? -cpu host,hypervisor-vendor=kvm,hypervisor-level=0 Does this. This is similar to the kvmclock case: it would allow us to make hypervisor-vendor=kvm use saner values as default, but letting old machine-types to override it for compatibility if required. Right now since I am using env-cpuid_hv_level == 0 as a flag. This means that: -cpu host,hypervisor-level=0,hypervisor-vendor=kvm -cpu host,hypervisor-vendor=kvm,hypervisor-level=0 end up with different CPUID data (Which I do not like). I will fix this in the next round. Did you want me to drop kvm0 and kvm1? -Don [...] -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [libvirt] TSC scaling interface to management
On 09/21/2012 05:51 AM, Marcelo Tosatti wrote: On Fri, Sep 21, 2012 at 12:02:46AM +0300, Dor Laor wrote: On 09/12/2012 06:39 PM, Marcelo Tosatti wrote: HW TSC scaling is a feature of AMD processors that allows a multiplier to be specified to the TSC frequency exposed to the guest. KVM also contains provision to trap TSC (KVM: Infrastructure for software and hardware based TSC rate scaling cc578287e3224d0da) or advance TSC frequency. This is useful when migrating to a host with different frequency and the guest is possibly using direct RDTSC instructions for purposes other than measuring cycles (that is, it previously calculated cycles-per-second, and uses that information which is stale after migration). qemu-x86: Set tsc_khz in kvm when supported (e7429073ed1a76518) added support for tsc_khz= option in QEMU. I am proposing the following changes so that management applications can work with this: 1) New option for tsc_khz, which is tsc_khz=host (QEMU command line option). Host means that QEMU is responsible for retrieving the TSC frequency of the host processor and use that. Management application does not have to deal with the burden. 2) New subsection with tsc_khz value. Destination host should consult supported features of running kernel and fail if feature is unsupported. It is not necessary to use this tsc_khz setting with modern guests using paravirtual clocks, or when its known that applications make proper use of the time interface provided by operating systems. On the other hand, legacy applications or setups which require no modification and correct operation while virtualized and make use of RDTSC might need this. Therefore it appears that this tsc_khz=auto option can be specified only if the user specifies so (it can be a per-guest flag hidden in the management configuration/manual). Sending this email to gather suggestions (or objections) to this interface. I'm not sure I understand the exact difference between the offers. 
We can define these 3 options: 1. Qemu/kvm won't make use of tsc scaling feature at all. 2. tsc scaling is used and we take the value either from the host or from the live migration data that overrides the later for incoming. As you've said, it should be passed through a sub section. 3. Manual setting of the value (uncommon). Is there another option worth considering? The questions is what should be the default. IMHO #2 is more appropriate to serve as a default since we do expect tsc to change between hosts. Option 1. is more appropriate to serve as a default given that modern guests make use of paravirt, as you have observed. but you also observed that legacy applications that use rdtsc (even over pv kernel) will still be affected by the physical tsc frequency. Since I'm not aware of downside for using scaling, I rather pick opt #2 as a default. That is, tsc scaling is only required if the guest does direct RDTSC on the expectation that the value won't change. Cheers, Dor -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 1/1] target-i386: Fix default Hypervisor level for hypervisor-vendor=kvm.
On 09/21/12 16:49, Eduardo Habkost wrote: On Fri, Sep 21, 2012 at 04:26:58PM -0400, Don Slutz wrote: On 09/21/12 10:18, Eduardo Habkost wrote: On Thu, Sep 20, 2012 at 04:06:27PM -0400, Don Slutz wrote: From http://lkml.indiana.edu/hypermail/linux/kernel/1205.0/00100.html EAX should be KVM_CPUID_FEATURES (0x4001) not 0. Added hypervisor-vendor=kvm0 to get the older CPUID result. kvm1 selects the newer one. Why not just make hypervisor-vendor=kvm control only the hypervisor vendor string, and support something like kvm-hypervisor-level=0 to restore the old cpuid_hv_level=0 behavior? -cpu host,hypervisor-vendor=kvm,hypervisor-level=0 Does this. Good. :-) This is similar to the kvmclock case: it would allow us to make hypervisor-vendor=kvm use saner values as default, but letting old machine-types to override it for compatibility if required. Right now since I am using env-cpuid_hv_level == 0 as a flag. This means that: -cpu host,hypervisor-level=0,hypervisor-vendor=kvm -cpu host,hypervisor-vendor=kvm,hypervisor-level=0 end up with different CPUID data (Which I do not like). I will fix this in the next round. Right. This has to be fixed. Did you want me to drop kvm0 and kvm1? Yes, if level is already configurable using the hypervisor-level property, I don't see the need for kvm0 and kvm1. If you make kvm_arch_init_vcpu() actually use those fields, you will end up implementing what's required to allow migration compatibility to be kept (the only thing missing is to make the CPU class a child of DeviceState, and add hypervisor-level=0 to the existing machine-types). :-) You mean like http://lists.gnu.org/archive/html/qemu-devel/2012-09/msg03402.html and http://lists.gnu.org/archive/html/qemu-devel/2012-09/msg03405.html already change kvm_arch_init_vcpu(). I did not know that I need to make the CPU class a child of DeviceState. Nor that I needed to add hypervisor-vendor=kvm,hypervisor-level=0 to the existing machine-types. 
Since without specifying hypervisor-level=0 it defaults to 0 and kvm_arch_init_vcpu() will default to setting hypervisor-vendor=kvm. -Don -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 4/9] mm: compaction: Abort compaction loop if lock is contended or run too long
On Fri, 21 Sep 2012 11:46:18 +0100 Mel Gorman mgor...@suse.de wrote: Changelog since V2 o Fix BUG_ON triggered due to pages left on cc.migratepages o Make compact_zone_order() require non-NULL arg `contended' Changelog since V1 o only abort the compaction if lock is contended or run too long o Rearranged the code by Andrea Arcangeli. isolate_migratepages_range() might isolate no pages if for example when zone-lru_lock is contended and running asynchronous compaction. In this case, we should abort compaction, otherwise, compact_zone will run a useless loop and make zone-lru_lock is even contended. hm, this appears to be identical to mm-compaction-abort-compaction-loop-if-lock-is-contended-or-run-too-long.patch mm-compaction-abort-compaction-loop-if-lock-is-contended-or-run-too-long-fix.patch mm-compaction-abort-compaction-loop-if-lock-is-contended-or-run-too-long-fix-2.patch so I simply omitted patches 2, 3 and 4. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 6/9] mm: compaction: Acquire the zone-lock as late as possible
On Fri, 21 Sep 2012 11:46:20 +0100 Mel Gorman mgor...@suse.de wrote: Compactions free scanner acquires the zone-lock when checking for PageBuddy pages and isolating them. It does this even if there are no PageBuddy pages in the range. This patch defers acquiring the zone lock for as long as possible. In the event there are no free pages in the pageblock then the lock will not be acquired at all which reduces contention on zone-lock. ... --- a/mm/compaction.c +++ b/mm/compaction.c @@ -93,6 +93,28 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock, return compact_checklock_irqsave(lock, flags, false, cc); } +/* Returns true if the page is within a block suitable for migration to */ +static bool suitable_migration_target(struct page *page) +{ + stray newline + int migratetype = get_pageblock_migratetype(page); + + /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ + if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) + return false; + + /* If the page is a large free page, then allow migration */ + if (PageBuddy(page) page_order(page) = pageblock_order) + return true; + + /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ + if (migrate_async_suitable(migratetype)) + return true; + + /* Otherwise skip the block */ + return false; +} + ... @@ -168,23 +193,38 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn, int isolated, i; struct page *page = cursor; - if (!pfn_valid_within(blockpfn)) { - if (strict) - return 0; - continue; - } + if (!pfn_valid_within(blockpfn)) + goto strict_check; nr_scanned++; - if (!PageBuddy(page)) { - if (strict) - return 0; - continue; - } + if (!PageBuddy(page)) + goto strict_check; + + /* + * The zone lock must be held to isolate freepages. This + * unfortunately this is a very coarse lock and can be this this + * heavily contended if there are parallel allocations + * or parallel compactions. 
For async compaction do not + * spin on the lock and we acquire the lock as late as + * possible. + */ + locked = compact_checklock_irqsave(cc-zone-lock, flags, + locked, cc); + if (!locked) + break; + + /* Recheck this is a suitable migration target under lock */ + if (!strict !suitable_migration_target(page)) + break; + + /* Recheck this is a buddy page under lock */ + if (!PageBuddy(page)) + goto strict_check; /* Found a free page, break it into order-0 pages */ isolated = split_free_page(page); if (!isolated strict) - return 0; + goto strict_check; total_isolated += isolated; for (i = 0; i isolated; i++) { list_add(page-lru, freelist); @@ -196,9 +236,23 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn, blockpfn += isolated - 1; cursor += isolated - 1; } + + continue; + +strict_check: + /* Abort isolation if the caller requested strict isolation */ + if (strict) { + total_isolated = 0; + goto out; + } } trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); + +out: + if (locked) + spin_unlock_irqrestore(cc-zone-lock, flags); + return total_isolated; } A a few things about this function. Would it be cleaner if we did if (!strict) { if (!suitable_migration_target(page)) break; } else { if (!PageBuddy(page)) goto strict_check; } and then remove the test of `strict' from strict_check (which then should be renamed)? Which perhaps means that the whole `strict_check' block can go away: if (!strict) { if (!suitable_migration_target(page)) break; } else { if (!PageBuddy(page)) { total_isolated = 0; goto out; } Have a think about it? The function is a little straggly. Secondly, it is correct/desirable to skip the (now misnamed
Re: [PATCH 8/9] mm: compaction: Cache if a pageblock was scanned and no pages were isolated
On Fri, 21 Sep 2012 11:46:22 +0100 Mel Gorman mgor...@suse.de wrote: When compaction was implemented it was known that scanning could potentially be excessive. The ideal was that a counter be maintained for each pageblock but maintaining this information would incur a severe penalty due to a shared writable cache line. It has reached the point where the scanning costs are an serious problem, particularly on long-lived systems where a large process starts and allocates a large number of THPs at the same time. Instead of using a shared counter, this patch adds another bit to the pageblock flags called PG_migrate_skip. If a pageblock is scanned by either migrate or free scanner and 0 pages were isolated, the pageblock is marked to be skipped in the future. When scanning, this bit is checked before any scanning takes place and the block skipped if set. The main difficulty with a patch like this is when to ignore the cached information? If it's ignored too often, the scanning rates will still be excessive. If the information is too stale then allocations will fail that might have otherwise succeeded. In this patch o CMA always ignores the information o If the migrate and free scanner meet then the cached information will be discarded if it's at least 5 seconds since the last time the cache was discarded o If there are a large number of allocation failures, discard the cache. The time-based heuristic is very clumsy but there are few choices for a better event. Depending solely on multiple allocation failures still allows excessive scanning when THP allocations are failing in quick succession due to memory pressure. Waiting until memory pressure is relieved would cause compaction to continually fail instead of using reclaim/compaction to try allocate the page. The time-based mechanism is clumsy but a better option is not obvious. ick. Wall time has sooo little relationship to what's happening in there. 
If we *have* to use polling, cannot we clock the poll with some metric which is at least vaguely related to the amount of activity? Number (or proportion) of pages scanned, for example? Or reset everything on the Nth trip around the zone? Or even a combination of one of these *and* of wall time, so the system will at least work harder when MM is under load. Also, what has to be done to avoid the polling altogether? eg/ie, zap a pageblock's PB_migrate_skip synchronously, when something was done to that pageblock which justifies repolling it? ... +static void reset_isolation_suitable(struct zone *zone) +{ + unsigned long start_pfn = zone-zone_start_pfn; + unsigned long end_pfn = zone-zone_start_pfn + zone-spanned_pages; + unsigned long pfn; + + /* + * Do not reset more than once every five seconds. If allocations are + * failing sufficiently quickly to allow this to happen then continually + * scanning for compaction is not going to help. The choice of five + * seconds is arbitrary but will mitigate excessive scanning. + */ + if (time_before(jiffies, zone-compact_blockskip_expire)) + return; + zone-compact_blockskip_expire = jiffies + (HZ * 5); + + /* Walk the zone and mark every pageblock as suitable for isolation */ + for (pfn = start_pfn; pfn end_pfn; pfn += pageblock_nr_pages) { + struct page *page; + if (!pfn_valid(pfn)) + continue; + + page = pfn_to_page(pfn); + if (zone != page_zone(page)) + continue; + + clear_pageblock_skip(page); + } What's the worst-case loop count here? +} + ... -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 1/1] target-i386: Fix default Hypervisor level for hypervisor-vendor=kvm.
On Fri, Sep 21, 2012 at 05:28:27PM -0400, Don Slutz wrote: On 09/21/12 16:49, Eduardo Habkost wrote: On Fri, Sep 21, 2012 at 04:26:58PM -0400, Don Slutz wrote: On 09/21/12 10:18, Eduardo Habkost wrote: On Thu, Sep 20, 2012 at 04:06:27PM -0400, Don Slutz wrote: From http://lkml.indiana.edu/hypermail/linux/kernel/1205.0/00100.html EAX should be KVM_CPUID_FEATURES (0x4001) not 0. Added hypervisor-vendor=kvm0 to get the older CPUID result. kvm1 selects the newer one. Why not just make hypervisor-vendor=kvm control only the hypervisor vendor string, and support something like kvm-hypervisor-level=0 to restore the old cpuid_hv_level=0 behavior? -cpu host,hypervisor-vendor=kvm,hypervisor-level=0 Does this. Good. :-) This is similar to the kvmclock case: it would allow us to make hypervisor-vendor=kvm use saner values as default, but letting old machine-types to override it for compatibility if required. Right now since I am using env-cpuid_hv_level == 0 as a flag. This means that: -cpu host,hypervisor-level=0,hypervisor-vendor=kvm -cpu host,hypervisor-vendor=kvm,hypervisor-level=0 end up with different CPUID data (Which I do not like). I will fix this in the next round. Right. This has to be fixed. Did you want me to drop kvm0 and kvm1? Yes, if level is already configurable using the hypervisor-level property, I don't see the need for kvm0 and kvm1. If you make kvm_arch_init_vcpu() actually use those fields, you will end up implementing what's required to allow migration compatibility to be kept (the only thing missing is to make the CPU class a child of DeviceState, and add hypervisor-level=0 to the existing machine-types). :-) You mean like http://lists.gnu.org/archive/html/qemu-devel/2012-09/msg03402.html and http://lists.gnu.org/archive/html/qemu-devel/2012-09/msg03405.html already change kvm_arch_init_vcpu(). Yes! Sorry, I hadn't reviewed all of your previous series yet. :-) I did not know that I need to make the CPU class a child of DeviceState. You don't. 
But we plan to do that, eventually, so we can put CPU compatibility properties into machine-types. Nor that I needed to add hypervisor-vendor=kvm,hypervisor-level=0 to the existing machine-types. Since without specifying hypervisor-level=0 it defaults to 0 and kvm_arch_init_vcpu() will default to setting hypervisor-vendor=kvm. What I would like to do is: to change the default to 0x4001 (that's the correct value), but make the pc-1.1 and older machine-types keep the hypervisor-level=0 default for compatibility. (But to be able to do that, we need to first make the CPU class a child of DeviceState, so that's something to be done later) -- Eduardo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH v3 11/19] Implement qmp and hmp commands for notification lists
On 09/21/2012 05:17 AM, Vasilis Liaskovitis wrote: Guest can respond to ACPI hotplug events e.g. with _EJ or _OST method. This patch implements a tail queue to store guest notifications for memory hot-add and hot-remove requests. Guest responses for memory hotplug command on a per-dimm basis can be detected with the new hmp command info memhp or the new qmp command query-memhp Naming doesn't match the QMP code. Examples: (qemu) device_add dimm,id=ram0 These notification items should probably be part of migration state (not yet implemented). In the case of libvirt driving migration, you already said in 10/19 that libvirt has to start the destination with the populated=on|off fields correct for each dimm according to the state it was in at the time the host started the update. Can the host hot unplug memory after migration has started? + +## +# @MemHpInfo: +# +# Information about status of a memory hotplug command +# +# @dimm: the Dimm associated with the result +# +# @result: the result of the hotplug command +# +# Since: 1.3 +# +## +{ 'type': 'MemHpInfo', + 'data': {'dimm': 'str', 'request': 'str', 'result': 'str'} } Should 'result' be a bool (true for success, false for still pending) or an enum, instead of a free-form string? Likewise, isn't 'request' going to be exactly one of two values (plug or unplug)? -- Eric Blake ebl...@redhat.com+1-919-301-3266 Libvirt virtualization library http://libvirt.org signature.asc Description: OpenPGP digital signature
Re: [RFC PATCH v3 12/19] Implement info memory-total and query-memory-total
On 09/21/2012 05:17 AM, Vasilis Liaskovitis wrote: Returns total physical memory available to guest in bytes, including hotplugged memory. Note that the number reported here may be different from what the guest sees e.g. if the guest has not logically onlined hotplugged memory. This functionality is provided independently of a balloon device, since a guest can be using ACPI memory hotplug without using a balloon device. Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com +++ b/hmp-commands.hx @@ -1464,6 +1464,8 @@ show qdev device model list show roms @item info memory-hotplug show memory-hotplug +@item info memory-total +show memory-total +++ b/monitor.c @@ -2747,6 +2747,13 @@ static mon_cmd_t info_cmds[] = { .mhandler.info = hmp_info_memory_hotplug, }, { +.name = memory-total, We're split on HMP naming conventions ('-' vs. '_'); we have 'show migrate_capabilities' but 'show block-jobs'. Oh well, that's life. Reviewed-by: Eric Blake ebl...@redhat.com -- Eric Blake ebl...@redhat.com+1-919-301-3266 Libvirt virtualization library http://libvirt.org signature.asc Description: OpenPGP digital signature
[PATCH v5 00/17] Allow changing of Hypervisor CPUIDs.
Also known as Paravirtualization CPUIDs. This is primarily done so that the guest will think it is running under vmware when hypervisor-vendor=vmware is specified as a property of a cpu. This depends on: http://lists.gnu.org/archive/html/qemu-devel/2012-09/msg01400.html As far as I know it is #4. It depends on (1) and (2) and (3). This change is based on: Microsoft Hypervisor CPUID Leaves: http://msdn.microsoft.com/en-us/library/windows/hardware/ff542428%28v=vs.85%29.aspx Linux kernel change starts with: http://fixunix.com/kernel/538707-use-cpuid-communicate-hypervisor.html Also: http://lkml.indiana.edu/hypermail/linux/kernel/1205.0/00100.html VMware documentation on CPUIDs (Mechanisms to determine if software is running in a VMware virtual machine): http://kb.vmware.com/selfservice/microsites/search.do?language=en_UScmd=displayKCexternalId=1009458 Changes from v4 to v5: Undo kvm_clock2 change. Add cpuid_hv_level_set; cpuid_hv_level == 0 is now valid. Add cpuid_hv_vendor_set; the null string is now valid. Handle kvm and cpuid_hv_level == 0. hypervisor-vendor=kvm,hypervisor-level=0 and hypervisor-level=0,hypervisor-vendor=kvm now do the same thing. Changes from v3 to v4: Added CPUID_HV_LEVEL_HYPERV, CPUID_HV_LEVEL_KVM. Added CPUID_HV_VENDOR_HYPERV. Added hyperv as known hypervisor-vendor. Allow hypervisor-level to be 0. Changes from v2 to v3: Clean post to qemu-devel. Changes from v1 to v2: 1) Added 1/4 from http://lists.gnu.org/archive/html/qemu-devel/2012-08/msg05153.html Because Fred is changing jobs and so will not be pushing to get this in. It needed to be rebased, And I needed it to complete the testing of this change. 2) Added 2/4 because of the re-work I needed a way to clear all KVM bits, 3) The rework of v1. Make it fit into the object model re-work of cpu.c for x86. 4) Added 3/4 -- The split out of the code that is not needed for accel=kvm. 
Changes from v2 to v3: Marcelo Tosatti: Its one big patch, better split in logically correlated patches (with better changelog). This would help reviewers. So split 3 and 4 into 3 to 17. More info in change log. No code change. Don Slutz (17): target-i386: Allow tsc-frequency to be larger then 2.147G target-i386: Add missing kvm bits. target-i386: Add Hypervisor level. target-i386: Add cpu object access routines for Hypervisor level. target-i386: Add cpu object access routines for Hypervisor level. target-i386: Use Hypervisor level in -machine pc,accel=kvm. target-i386: Use Hypervisor level in -machine pc,accel=tcg. target-i386: Add Hypervisor vendor. target-i386: Add cpu object access routines for Hypervisor vendor. target-i386: Use Hypervisor vendor in -machine pc,accel=kvm. target-i386: Use Hypervisor vendor in -machine pc,accel=tcg. target-i386: Add some known names to Hypervisor vendor. target-i386: Add optional Hypervisor leaf extra. target-i386: Add cpu object access routines for Hypervisor leaf extra. target-i386: Add setting of Hypervisor leaf extra for known vmare4. target-i386: Use Hypervisor leaf extra in -machine pc,accel=kvm. target-i386: Use Hypervisor leaf extra in -machine pc,accel=tcg. target-i386/cpu.c | 285 - target-i386/cpu.h | 34 +++ target-i386/kvm.c | 33 +- 3 files changed, 341 insertions(+), 11 deletions(-) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 01/17] target-i386: Allow tsc-frequency to be larger than 2.147G
The check using INT_MAX (2147483647) is wrong in this case. Signed-off-by: Fred Oliveira folive...@cloudswitch.com Signed-off-by: Don Slutz d...@cloudswitch.com --- target-i386/cpu.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/target-i386/cpu.c b/target-i386/cpu.c index af50a8f..0313cf5 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -1146,7 +1146,7 @@ static void x86_cpuid_set_tsc_freq(Object *obj, Visitor *v, void *opaque, { X86CPU *cpu = X86_CPU(obj); const int64_t min = 0; -const int64_t max = INT_MAX; +const int64_t max = INT64_MAX; int64_t value; visit_type_freq(v, value, name, errp); -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 03/17] target-i386: Add Hypervisor level.
Also known as Paravirtualization level or maximum cpuid function present in this leaf. This is just the EAX value for 0x4000. QEMU knows this is KVM_CPUID_SIGNATURE (0x4000). This is based on: Microsoft Hypervisor CPUID Leaves: http://msdn.microsoft.com/en-us/library/windows/hardware/ff542428%28v=vs.85%29.aspx Linux kernel change starts with: http://fixunix.com/kernel/538707-use-cpuid-communicate-hypervisor.html Also: http://lkml.indiana.edu/hypermail/linux/kernel/1205.0/00100.html VMware documentation on CPUIDs (Mechanisms to determine if software is running in a VMware virtual machine): http://kb.vmware.com/selfservice/microsites/search.do?language=en_UScmd=displayKCexternalId=1009458 QEMU has the value HYPERV_CPUID_MIN defined. Signed-off-by: Don Slutz d...@cloudswitch.com --- target-i386/cpu.h |3 +++ 1 files changed, 3 insertions(+), 0 deletions(-) diff --git a/target-i386/cpu.h b/target-i386/cpu.h index 5265c5a..1899f69 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -777,11 +777,14 @@ typedef struct CPUX86State { uint32_t cpuid_ext3_features; uint32_t cpuid_apic_id; bool cpuid_vendor_override; +bool cpuid_hv_level_set; /* Store the results of Centaur's CPUID instructions */ uint32_t cpuid_xlevel2; uint32_t cpuid_ext4_features; /* Flags from CPUID[EAX=7,ECX=0].EBX */ uint32_t cpuid_7_0_ebx; +/* Hypervisor CPUIDs */ +uint32_t cpuid_hv_level; /* MTRRs */ uint64_t mtrr_fixed[11]; -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 02/17] target-i386: Add missing kvm bits.
Fix duplicate name (kvmclock = kvm_clock2) also. Signed-off-by: Don Slutz d...@cloudswitch.com --- target-i386/cpu.c | 12 1 files changed, 8 insertions(+), 4 deletions(-) diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 0313cf5..25ca986 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -87,10 +87,14 @@ static const char *ext3_feature_name[] = { }; static const char *kvm_feature_name[] = { -kvmclock, kvm_nopiodelay, kvm_mmu, kvmclock, kvm_asyncpf, NULL, kvm_pv_eoi, NULL, -NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, -NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, -NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +kvmclock, kvm_nopiodelay, kvm_mmu, kvmclock, +kvm_asyncpf, kvm_steal_time, kvm_pv_eoi, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, +kvm_clock_stable, NULL, NULL, NULL, +NULL, NULL, NULL, NULL, }; static const char *svm_feature_name[] = { -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 04/17] target-i386: Add cpu object access routines for Hypervisor level.
These are modeled after x86_cpuid_get_xlevel and x86_cpuid_set_xlevel. Signed-off-by: Don Slutz d...@cloudswitch.com --- target-i386/cpu.c | 29 + 1 files changed, 29 insertions(+), 0 deletions(-) diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 25ca986..7b31de9 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -1166,6 +1166,32 @@ static void x86_cpuid_set_tsc_freq(Object *obj, Visitor *v, void *opaque, cpu-env.tsc_khz = value / 1000; } +static void x86_cpuid_get_hv_level(Object *obj, Visitor *v, void *opaque, +const char *name, Error **errp) +{ +X86CPU *cpu = X86_CPU(obj); + +visit_type_uint32(v, cpu-env.cpuid_hv_level, name, errp); +} + +static void x86_cpuid_set_hv_level(Object *obj, Visitor *v, void *opaque, +const char *name, Error **errp) +{ +X86CPU *cpu = X86_CPU(obj); +uint32_t value; + +visit_type_uint32(v, value, name, errp); +if (error_is_set(errp)) { +return; +} + +if ((value != 0) (value 0x4000)) { +value += 0x4000; +} +cpu-env.cpuid_hv_level = value; +cpu-env.cpuid_hv_level_set = true; +} + #if !defined(CONFIG_USER_ONLY) static void x86_get_hv_spinlocks(Object *obj, Visitor *v, void *opaque, const char *name, Error **errp) @@ -2061,6 +2087,9 @@ static void x86_cpu_initfn(Object *obj) object_property_add(obj, enforce, bool, x86_cpuid_get_enforce, x86_cpuid_set_enforce, NULL, NULL, NULL); +object_property_add(obj, hypervisor-level, int, +x86_cpuid_get_hv_level, +x86_cpuid_set_hv_level, NULL, NULL, NULL); #if !defined(CONFIG_USER_ONLY) object_property_add(obj, hv_spinlocks, int, x86_get_hv_spinlocks, -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 05/17] target-i386: Add cpu object access routines for Hypervisor level.
These are modeled after x86_cpuid_get_xlevel and x86_cpuid_set_xlevel. Signed-off-by: Don Slutz d...@cloudswitch.com --- target-i386/cpu.c |8 target-i386/cpu.h |2 ++ 2 files changed, 10 insertions(+), 0 deletions(-) diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 7b31de9..e5deca2 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -1193,6 +1193,11 @@ static void x86_cpuid_set_hv_level(Object *obj, Visitor *v, void *opaque, } #if !defined(CONFIG_USER_ONLY) +static void x86_set_hyperv(Object *obj, Error **errp) +{ +object_property_set_int(obj, CPUID_HV_LEVEL_HYPERV, hypervisor-level, errp); +} + static void x86_get_hv_spinlocks(Object *obj, Visitor *v, void *opaque, const char *name, Error **errp) { @@ -1215,6 +1220,7 @@ static void x86_set_hv_spinlocks(Object *obj, Visitor *v, void *opaque, return; } hyperv_set_spinlock_retries(value); +x86_set_hyperv(obj, errp); } static void x86_get_hv_relaxed(Object *obj, Visitor *v, void *opaque, @@ -1235,6 +1241,7 @@ static void x86_set_hv_relaxed(Object *obj, Visitor *v, void *opaque, return; } hyperv_enable_relaxed_timing(value); +x86_set_hyperv(obj, errp); } static void x86_get_hv_vapic(Object *obj, Visitor *v, void *opaque, @@ -1255,6 +1262,7 @@ static void x86_set_hv_vapic(Object *obj, Visitor *v, void *opaque, return; } hyperv_enable_vapic_recommended(value); +x86_set_hyperv(obj, errp); } #endif diff --git a/target-i386/cpu.h b/target-i386/cpu.h index 1899f69..3152a4e 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -488,6 +488,8 @@ #define CPUID_VENDOR_VIA CentaurHauls +#define CPUID_HV_LEVEL_HYPERV 0x4005 + #define CPUID_MWAIT_IBE (1 1) /* Interrupts can exit capability */ #define CPUID_MWAIT_EMX (1 0) /* enumeration supported */ -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 06/17] target-i386: Use Hypervisor level in -machine pc,accel=kvm.
Also known as Paravirtualization level. This change is based on: Microsoft Hypervisor CPUID Leaves: http://msdn.microsoft.com/en-us/library/windows/hardware/ff542428%28v=vs.85%29.aspx Linux kernel change starts with: http://fixunix.com/kernel/538707-use-cpuid-communicate-hypervisor.html Also: http://lkml.indiana.edu/hypermail/linux/kernel/1205.0/00100.html VMware documentation on CPUIDs (Mechanisms to determine if software is running in a VMware virtual machine): http://kb.vmware.com/selfservice/microsites/search.do?language=en_UScmd=displayKCexternalId=1009458 QEMU knows this is KVM_CPUID_SIGNATURE (0x4000). Signed-off-by: Don Slutz d...@cloudswitch.com --- target-i386/kvm.c |4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/target-i386/kvm.c b/target-i386/kvm.c index 895d848..8462c75 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -389,12 +389,12 @@ int kvm_arch_init_vcpu(CPUX86State *env) c = cpuid_data.entries[cpuid_i++]; memset(c, 0, sizeof(*c)); c-function = KVM_CPUID_SIGNATURE; -if (!hyperv_enabled()) { +if (!env-cpuid_hv_level_set) { memcpy(signature, KVMKVMKVM\0\0\0, 12); c-eax = 0; } else { memcpy(signature, Microsoft Hv, 12); -c-eax = HYPERV_CPUID_MIN; +c-eax = env-cpuid_hv_level; } c-ebx = signature[0]; c-ecx = signature[1]; -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html