[PATCH v2 2/3] KVM: PPC: Book3S HV: Make a HPTE removal function available
This makes a HPTE removal function, kvmppc_do_h_remove(), available outside book3s_hv_rm_mmu.c. This will be used by the HPT writing code. Signed-off-by: Paul Mackerras pau...@samba.org --- v2: basically unchanged from v1, just rediffed arch/powerpc/include/asm/kvm_book3s.h |3 +++ arch/powerpc/kvm/book3s_hv_rm_mmu.c | 19 +-- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index fea768f..46763d10 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -160,6 +160,9 @@ extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel, pgd_t *pgdir, bool realmode, unsigned long *idx_ret); +extern long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, + unsigned long pte_index, unsigned long avpn, + unsigned long *hpret); extern long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long *map); diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index a96f90a..2334000 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -365,11 +365,10 @@ static inline int try_lock_tlbie(unsigned int *lock) return old == 0; } -long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, -unsigned long pte_index, unsigned long avpn, -unsigned long va) +long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, + unsigned long pte_index, unsigned long avpn, + unsigned long *hpret) { - struct kvm *kvm = vcpu-kvm; unsigned long *hpte; unsigned long v, r, rb; struct revmap_entry *rev; @@ -411,10 +410,18 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, note_hpte_modification(kvm, rev); unlock_hpte(hpte, 0); - vcpu-arch.gpr[4] = v; - vcpu-arch.gpr[5] = r; + hpret[0] = v; + hpret[1] = r; 
return H_SUCCESS; } +EXPORT_SYMBOL_GPL(kvmppc_do_h_remove); + +long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, +unsigned long pte_index, unsigned long avpn) +{ + return kvmppc_do_h_remove(vcpu->kvm, flags, pte_index, avpn, + &vcpu->arch.gpr[4]); +} long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) { -- 1.7.10.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 1/3] KVM: PPC: Book3S HV: Add a mechanism for recording modified HPTEs
This uses a bit in our record of the guest view of the HPTE to record when the HPTE gets modified. We use a reserved bit for this, and ensure that this bit is always cleared in HPTE values returned to the guest. The recording of modified HPTEs is only done if other code indicates its interest by setting kvm-arch.hpte_mod_interest to a non-zero value. The reason for this is that when later commits add facilities for userspace to read the HPT, the first pass of reading the HPT will be quicker if there are no (or very few) HPTEs marked as modified, rather than having most HPTEs marked as modified. Signed-off-by: Paul Mackerras pau...@samba.org --- v2: added HPTE_GR_RESERVED, clear those bits in H_ENTER arch/powerpc/include/asm/kvm_book3s_64.h |9 + arch/powerpc/include/asm/kvm_host.h |1 + arch/powerpc/kvm/book3s_hv_rm_mmu.c | 28 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 1472a5b..b322e5b 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -50,6 +50,15 @@ extern int kvm_hpt_order;/* order of preallocated HPTs */ #define HPTE_V_HVLOCK 0x40UL #define HPTE_V_ABSENT 0x20UL +/* + * We use this bit in the guest_rpte field of the revmap entry + * to indicate a modified HPTE. 
+ */ +#define HPTE_GR_MODIFIED (1ul << 62) + +/* These bits are reserved in the guest view of the HPTE */ +#define HPTE_GR_RESERVED HPTE_GR_MODIFIED + static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits) { unsigned long tmp, old; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 3093896..58c7264 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -248,6 +248,7 @@ struct kvm_arch { atomic_t vcpus_running; unsigned long hpt_npte; unsigned long hpt_mask; + atomic_t hpte_mod_interest; spinlock_t slot_phys_lock; unsigned short last_vcpu[NR_CPUS]; struct kvmppc_vcore *vcores[KVM_MAX_VCORES]; diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 362dffe..a96f90a 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -66,6 +66,17 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, } EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); +/* + * Note modification of an HPTE; set the HPTE modified bit + * if anyone is interested. 
+ */ +static inline void note_hpte_modification(struct kvm *kvm, + struct revmap_entry *rev) +{ + if (atomic_read(&kvm->arch.hpte_mod_interest)) + rev->guest_rpte |= HPTE_GR_MODIFIED; +} + /* Remove this HPTE from the chain for a real page */ static void remove_revmap_chain(struct kvm *kvm, long pte_index, struct revmap_entry *rev, @@ -138,7 +149,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, unsigned long slot_fn, hva; unsigned long *hpte; struct revmap_entry *rev; - unsigned long g_ptel = ptel; + unsigned long g_ptel; struct kvm_memory_slot *memslot; unsigned long *physp, pte_size; unsigned long is_io; @@ -153,6 +164,8 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, return H_PARAMETER; writing = hpte_is_writable(ptel); pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID); + ptel &= ~HPTE_GR_RESERVED; + g_ptel = ptel; /* used later to detect if we might have been invalidated */ mmu_seq = kvm->mmu_notifier_seq; @@ -287,8 +300,10 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, rev = &kvm->arch.revmap[pte_index]; if (realmode) rev = real_vmalloc_addr(rev); - if (rev) + if (rev) { rev->guest_rpte = g_ptel; + note_hpte_modification(kvm, rev); + } /* Link HPTE into reverse-map chain */ if (pteh & HPTE_V_VALID) { @@ -392,7 +407,8 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, /* Read PTE low word after tlbie to get final R/C values */ remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]); } - r = rev->guest_rpte; + r = rev->guest_rpte & ~HPTE_GR_RESERVED; + note_hpte_modification(kvm, rev); unlock_hpte(hpte, 0); vcpu->arch.gpr[4] = v; @@ -466,6 +482,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) args[j] = ((0x80 | flags) << 56) + pte_index; rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + note_hpte_modification(kvm, rev); if (!(hp[0] & HPTE_V_VALID)) {
[PATCH v2 3/3] KVM: PPC: Book3S HV: Provide a method for userspace to read and write the HPT
A new ioctl, KVM_PPC_GET_HTAB_FD, returns a file descriptor. Reads on this fd return the contents of the HPT (hashed page table), writes create and/or remove entries in the HPT. There is a new capability, KVM_CAP_PPC_HTAB_FD, to indicate the presence of the ioctl. The ioctl takes an argument structure with the index of the first HPT entry to read out and a set of flags. The flags indicate whether the user is intending to read or write the HPT, and whether to return all entries or only the bolted entries (those with the bolted bit, 0x10, set in the first doubleword). This is intended for use in implementing qemu's savevm/loadvm and for live migration. Therefore, on reads, the first pass returns information about all HPTEs (or all bolted HPTEs). When the first pass reaches the end of the HPT, it returns from the read. Subsequent reads only return information about HPTEs that have changed since they were last read. A read that finds no changed HPTEs in the HPT following where the last read finished will return 0 bytes. The format of the data provides a simple run-length compression of the invalid entries. Each block of data starts with a header that indicates the index (position in the HPT, which is just an array), the number of valid entries starting at that index (may be zero), and the number of invalid entries following those valid entries. The valid entries, 16 bytes each, follow the header. The invalid entries are not explicitly represented. 
Signed-off-by: Paul Mackerras pau...@samba.org --- v2: added comments, added reserved field in struct kvm_get_htab_fd Documentation/virtual/kvm/api.txt| 53 + arch/powerpc/include/asm/kvm_book3s_64.h | 22 ++ arch/powerpc/include/asm/kvm_ppc.h |2 + arch/powerpc/include/uapi/asm/kvm.h | 25 +++ arch/powerpc/kvm/book3s_64_mmu_hv.c | 344 ++ arch/powerpc/kvm/book3s_hv.c | 12 -- arch/powerpc/kvm/powerpc.c | 17 ++ include/uapi/linux/kvm.h |3 + 8 files changed, 466 insertions(+), 12 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 6671fdc..33080ea 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2071,6 +2071,59 @@ KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; source cpu in parm Note that the vcpu ioctl is asynchronous to vcpu execution. +4.78 KVM_PPC_GET_HTAB_FD + +Capability: KVM_CAP_PPC_HTAB_FD +Architectures: powerpc +Type: vm ioctl +Parameters: Pointer to struct kvm_get_htab_fd (in) +Returns: file descriptor number (= 0) on success, -1 on error + +This returns a file descriptor that can be used either to read out the +entries in the guest's hashed page table (HPT), or to write entries to +initialize the HPT. The returned fd can only be written to if the +KVM_GET_HTAB_WRITE bit is set in the flags field of the argument, and +can only be read if that bit is clear. The argument struct looks like +this: + +/* For KVM_PPC_GET_HTAB_FD */ +struct kvm_get_htab_fd { + __u64 flags; + __u64 start_index; +}; + +/* Values for kvm_get_htab_fd.flags */ +#define KVM_GET_HTAB_BOLTED_ONLY ((__u64)0x1) +#define KVM_GET_HTAB_WRITE ((__u64)0x2) + +The `start_index' field gives the index in the HPT of the entry at +which to start reading. It is ignored when writing. + +Reads on the fd will initially supply information about all +interesting HPT entries. Interesting entries are those with the +bolted bit set, if the KVM_GET_HTAB_BOLTED_ONLY bit is set, otherwise +all entries. 
When the end of the HPT is reached, the read() will +return. If read() is called again on the fd, it will start again from +the beginning of the HPT, but will only return HPT entries that have +changed since they were last read. + +Data read or written is structured as a header (8 bytes) followed by a +series of valid HPT entries (16 bytes) each. The header indicates how +many valid HPT entries there are and how many invalid entries follow +the valid entries. The invalid entries are not represented explicitly +in the stream. The header format is: + +struct kvm_get_htab_header { + __u32 index; + __u16 n_valid; + __u16 n_invalid; +}; + +Writes to the fd create HPT entries starting at the index given in the +header; first `n_valid' valid entries with contents from the data +written, then `n_invalid' invalid entries, invalidating any previously +valid entries found. + 5. The kvm_run structure diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index b322e5b..38bec1d 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -246,4 +246,26 @@ static inline bool slot_is_aligned(struct kvm_memory_slot *memslot, return !(memslot-base_gfn mask) !(memslot-npages mask); } +/* + * This
Re: [patch 18/18] KVM: x86: update pvclock area conditionally, on cpu migration
On 11/20/2012 01:58 AM, Marcelo Tosatti wrote: As requested by Glauber, do not update kvmclock area on vcpu->pcpu migration, in case the host has stable TSC. This is to reduce cacheline bouncing. Signed-off-by: Marcelo Tosatti mtosa...@redhat.com Index: vsyscall/arch/x86/kvm/x86.c === --- vsyscall.orig/arch/x86/kvm/x86.c +++ vsyscall/arch/x86/kvm/x86.c @@ -2615,7 +2615,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu kvm_x86_ops->write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; } - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + /* + * On a host with synchronized TSC, there is no need to update + * kvmclock on vcpu->cpu migration + */ + if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); if (vcpu->cpu != cpu) kvm_migrate_timers(vcpu); vcpu->cpu = cpu; Ok. Since you are only touching the one in kvm_arch_vcpu_load() and leaving the others untouched, it looks correct. Acked-by: Glauber Costa glom...@parallels.com -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] KVM: PPC: Book3S HV: Fix bug causing loss of page dirty state
This fixes a bug where adding a new guest HPT entry via the H_ENTER hcall would lose the changed bit in the reverse map information for the guest physical page being mapped. The result was that the KVM_GET_DIRTY_LOG could return a zero bit for the page even though the page had been modified by the guest. This fixes it by only modifying the index and present bits in the reverse map entry, thus preserving the reference and change bits. We were also unnecessarily setting the reference bit, and this fixes that too. Signed-off-by: Paul Mackerras pau...@samba.org --- This is against Alex Graf's kvm-ppc-next branch plus the series of three patches I just sent, but it should be independent of that series. arch/powerpc/kvm/book3s_hv_rm_mmu.c |6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 2334000..fc3da32 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -59,10 +59,10 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, head->back = pte_index; } else { rev->forw = rev->back = pte_index; - i = pte_index; + *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | + pte_index | KVMPPC_RMAP_PRESENT; } - smp_wmb(); - *rmap = i | KVMPPC_RMAP_REFERENCED | KVMPPC_RMAP_PRESENT; /* unlock */ + unlock_rmap(rmap); } EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); -- 1.7.10.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 00/18] pvclock vsyscall support + KVM hypervisor support (v5)
On 11/20/2012 01:57 AM, Marcelo Tosatti wrote: This patchset, based on earlier work by Jeremy Fitzhardinge, implements paravirtual clock vsyscall support. It should be possible to implement Xen support relatively easily. It reduces clock_gettime from 500 cycles to 200 cycles on my testbox. There are no more significant objections from my side. I will still try to go through it again today just in case. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: PPC: Book3S HV: Fix bug causing loss of page dirty state
On 20.11.2012, at 10:01, Paul Mackerras wrote: This fixes a bug where adding a new guest HPT entry via the H_ENTER hcall would lose the changed bit in the reverse map information for the guest physical page being mapped. The result was that the KVM_GET_DIRTY_LOG could return a zero bit for the page even though the page had been modified by the guest. This fixes it by only modifying the index and present bits in the reverse map entry, thus preserving the reference and change bits. We were also unnecessarily setting the reference bit, and this fixes that too. Signed-off-by: Paul Mackerras pau...@samba.org Thanks, applied to kvm-ppc-next. Alex --- This is against Alex Graf's kvm-ppc-next branch plus the series of three patches I just sent, but it should be independent of that series. arch/powerpc/kvm/book3s_hv_rm_mmu.c |6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 2334000..fc3da32 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -59,10 +59,10 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, head-back = pte_index; } else { rev-forw = rev-back = pte_index; - i = pte_index; + *rmap = (*rmap ~KVMPPC_RMAP_INDEX) | + pte_index | KVMPPC_RMAP_PRESENT; } - smp_wmb(); - *rmap = i | KVMPPC_RMAP_REFERENCED | KVMPPC_RMAP_PRESENT; /* unlock */ + unlock_rmap(rmap); } EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 3/3] KVM: PPC: Book3S HV: Provide a method for userspace to read and write the HPT
On 20.11.2012, at 09:57, Paul Mackerras wrote: A new ioctl, KVM_PPC_GET_HTAB_FD, returns a file descriptor. Reads on this fd return the contents of the HPT (hashed page table), writes create and/or remove entries in the HPT. There is a new capability, KVM_CAP_PPC_HTAB_FD, to indicate the presence of the ioctl. The ioctl takes an argument structure with the index of the first HPT entry to read out and a set of flags. The flags indicate whether the user is intending to read or write the HPT, and whether to return all entries or only the bolted entries (those with the bolted bit, 0x10, set in the first doubleword). This is intended for use in implementing qemu's savevm/loadvm and for live migration. Therefore, on reads, the first pass returns information about all HPTEs (or all bolted HPTEs). When the first pass reaches the end of the HPT, it returns from the read. Subsequent reads only return information about HPTEs that have changed since they were last read. A read that finds no changed HPTEs in the HPT following where the last read finished will return 0 bytes. The format of the data provides a simple run-length compression of the invalid entries. Each block of data starts with a header that indicates the index (position in the HPT, which is just an array), the number of valid entries starting at that index (may be zero), and the number of invalid entries following those valid entries. The valid entries, 16 bytes each, follow the header. The invalid entries are not explicitly represented. 
Signed-off-by: Paul Mackerras pau...@samba.org --- v2: added comments, added reserved field in struct kvm_get_htab_fd Documentation/virtual/kvm/api.txt| 53 + arch/powerpc/include/asm/kvm_book3s_64.h | 22 ++ arch/powerpc/include/asm/kvm_ppc.h |2 + arch/powerpc/include/uapi/asm/kvm.h | 25 +++ arch/powerpc/kvm/book3s_64_mmu_hv.c | 344 ++ arch/powerpc/kvm/book3s_hv.c | 12 -- arch/powerpc/kvm/powerpc.c | 17 ++ include/uapi/linux/kvm.h |3 + 8 files changed, 466 insertions(+), 12 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 6671fdc..33080ea 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2071,6 +2071,59 @@ KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; source cpu in parm Note that the vcpu ioctl is asynchronous to vcpu execution. +4.78 KVM_PPC_GET_HTAB_FD + +Capability: KVM_CAP_PPC_HTAB_FD +Architectures: powerpc +Type: vm ioctl +Parameters: Pointer to struct kvm_get_htab_fd (in) +Returns: file descriptor number (= 0) on success, -1 on error + +This returns a file descriptor that can be used either to read out the +entries in the guest's hashed page table (HPT), or to write entries to +initialize the HPT. The returned fd can only be written to if the +KVM_GET_HTAB_WRITE bit is set in the flags field of the argument, and +can only be read if that bit is clear. The argument struct looks like +this: + +/* For KVM_PPC_GET_HTAB_FD */ +struct kvm_get_htab_fd { + __u64 flags; + __u64 start_index; Documentation is out of sync :). Applied all 3 with fixed documentation. Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] s390: Virtual channel subsystem support.
On Mon, 19 Nov 2012 14:30:00 +0100 Alexander Graf ag...@suse.de wrote: On 31.10.2012, at 17:24, Cornelia Huck wrote: Provide a mechanism for qemu to provide fully virtual subchannels to the guest. In the KVM case, this relies on the kernel's css support for I/O and machine check interrupt handling. The !KVM case handles interrupts on its own. Signed-off-by: Cornelia Huck cornelia.h...@de.ibm.com --- hw/s390x/Makefile.objs |1 + hw/s390x/css.c | 1209 hw/s390x/css.h | 90 target-s390x/Makefile.objs |2 +- target-s390x/cpu.h | 232 + target-s390x/helper.c | 146 ++ target-s390x/ioinst.c | 737 +++ target-s390x/ioinst.h | 213 target-s390x/kvm.c | 251 - target-s390x/misc_helper.c |6 +- 10 files changed, 2872 insertions(+), 15 deletions(-) create mode 100644 hw/s390x/css.c create mode 100644 hw/s390x/css.h create mode 100644 target-s390x/ioinst.c create mode 100644 target-s390x/ioinst.h diff --git a/hw/s390x/Makefile.objs b/hw/s390x/Makefile.objs index 096dfcd..378b099 100644 --- a/hw/s390x/Makefile.objs +++ b/hw/s390x/Makefile.objs @@ -4,3 +4,4 @@ obj-y := $(addprefix ../,$(obj-y)) obj-y += sclp.o obj-y += event-facility.o obj-y += sclpquiesce.o sclpconsole.o +obj-y += css.o diff --git a/hw/s390x/css.c b/hw/s390x/css.c new file mode 100644 index 000..9adffb3 --- /dev/null +++ b/hw/s390x/css.c @@ -0,0 +1,1209 @@ +/* + * Channel subsystem base support. + * + * Copyright 2012 IBM Corp. + * Author(s): Cornelia Huck cornelia.h...@de.ibm.com + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at + * your option) any later version. See the COPYING file in the top-level + * directory. 
+ */ + +#include qemu-thread.h +#include qemu-queue.h +#include hw/qdev.h +#include bitops.h +#include kvm.h +#include cpu.h +#include ioinst.h +#include css.h +#include virtio-ccw.h + +typedef struct CrwContainer { +CRW crw; +QTAILQ_ENTRY(CrwContainer) sibling; +} CrwContainer; + +typedef struct ChpInfo { +uint8_t in_use; +uint8_t type; +uint8_t is_virtual; +} ChpInfo; + +typedef struct SubchSet { +SubchDev *sch[MAX_SCHID + 1]; +unsigned long schids_used[BITS_TO_LONGS(MAX_SCHID + 1)]; +unsigned long devnos_used[BITS_TO_LONGS(MAX_SCHID + 1)]; +} SubchSet; + +typedef struct CssImage { +SubchSet *sch_set[MAX_SSID + 1]; +ChpInfo chpids[MAX_CHPID + 1]; +} CssImage; + +typedef struct ChannelSubSys { +QTAILQ_HEAD(, CrwContainer) pending_crws; +bool do_crw_mchk; +bool crws_lost; +uint8_t max_cssid; +uint8_t max_ssid; +bool chnmon_active; +uint64_t chnmon_area; +CssImage *css[MAX_CSSID + 1]; +uint8_t default_cssid; +} ChannelSubSys; + +static ChannelSubSys *channel_subsys; + +int css_create_css_image(uint8_t cssid, bool default_image) +{ +if (cssid MAX_CSSID) { +return -EINVAL; +} +if (channel_subsys-css[cssid]) { +return -EBUSY; +} +channel_subsys-css[cssid] = g_try_malloc0(sizeof(CssImage)); +if (!channel_subsys-css[cssid]) { +return -ENOMEM; +} +if (default_image) { +channel_subsys-default_cssid = cssid; +} +return 0; +} + +static void css_write_phys_pmcw(uint64_t addr, PMCW *pmcw) +{ +int i; +uint32_t offset = 0; +struct copy_pmcw { +uint32_t intparm; +uint16_t flags; +uint16_t devno; +uint8_t lpm; +uint8_t pnom; +uint8_t lpum; +uint8_t pim; +uint16_t mbi; +uint8_t pom; +uint8_t pam; +uint8_t chpid[8]; +uint32_t chars; +} *copy; This needs to be packed. Also, it might be a good idea to separate the struct definition from the actual code ;). + +copy = (struct copy_pmcw *)pmcw; This will break on any system that doesn't coincidently stick to the same bitfield order as s390x. Please drop any usage of bitfields in QEMU source code :). 
+stl_phys(addr + offset, copy-intparm); +offset += sizeof(copy-intparm); Can't you just use cpu_physical_memory_map() and assign things left and right as you see fit? Or prepare the target endianness struct on the stack and cpu_physical_memory_read/write it from/to guest memory. All that copying stuff (other places as well) was still on my todo list - just wanted to get the patches out of the door so people could take a look at the interface. Also, please split this patch into smaller patches :). As it is now it's very hard to review. However, apart from the above issues (which may happen in other places of the code further down, I just
Re: [kvmarm] [PATCH v4 09/14] KVM: ARM: Emulation framework and CP15 emulation
Peter Maydell peter.mayd...@linaro.org writes: On 19 November 2012 15:01, Will Deacon will.dea...@arm.com wrote: On Sat, Nov 10, 2012 at 03:43:13PM +, Christoffer Dall wrote: +/* + * A15-specific CP15 registers. + * Important: Must be sorted ascending by CRn, CRM, Op1, Op2 + */ +static const struct coproc_reg a15_regs[] = { + /* MPIDR: we use VMPIDR for guest access. */ + { CRn( 0), CRm( 0), Op1( 0), Op2( 5), is32, + NULL, reset_mpidr, c0_MPIDR }, + + /* SCTLR: swapped by interrupt.S. */ + { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32, + NULL, reset_val, c1_SCTLR, 0x00C50078 }, Why is the SCTLR included here as an A15-specific register? Rusty might remember the exact answer, but probably because the SCTLR reset value is IMPDEF. Indeed... Cheers, Rusty. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 0/2] x86: clear vmcss on all cpus when doing kdump if necessary
于 2012年11月20日 08:32, Marcelo Tosatti 写道: On Fri, Nov 16, 2012 at 06:12:58PM +0800, zhangyanfei wrote: Hello Marcelo, Any thoughts? I thought a function call was OK, but its better to have all code in vmx.c. Please have an atomic notifier in kexec.c (registered by KVM module via atomic_notifier_chain_register etc). Other than that, which is largely cosmetic, it looks fine. Sorry for not expressing this earlier. Hmm, Thanks. I will resend a new patch set. Thanks Zhang -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v11] kvm: notify host when the guest is panicked
Hi Marcelo, On Tue, Nov 13, 2012 at 12:19:08AM -0200, Marcelo Tosatti wrote: On Fri, Nov 09, 2012 at 03:17:39PM -0500, Sasha Levin wrote: On Mon, Nov 5, 2012 at 8:58 PM, Hu Tao hu...@cn.fujitsu.com wrote: But in the case of panic notification, more dependency means more chances of failure of panic notification. Say, if we use a virtio device to do panic notification, then we will fail if: virtio itself has problems, virtio for some reason can't be deployed(neither built-in or as a module), or guest doesn't support virtio, etc. Add polling to your virtio device. If it didn't notify of a panic but taking more than 20 sec to answer your poll request you can assume it's dead. Actually, just use virtio-serial and something in userspace on the guest. They want the guest to stop, so a memory dump can be taken by management interface. Hu Tao, lets assume port I/O is the preferred method for communication. Okey. Now, the following comments have still not been addressed: 1) Lifecycle of the stopped guest and interaction with other stopped states in QEMU. Patch 3 already deals with run state transitions. But in case I'm missing something, could you be more specific? 2) Format of the interface for other architectures (you can choose a different KVM supported architecture and write an example). 3) Clear/documented management interface for the feature. It is documented in patch 0: Documentation/virtual/kvm/pv_event.txt. Does it need to be improved? -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Re: Re: [RFC PATCH 0/2] kvm/vmx: Output TSC offset
Hi Marcelo, Sorry for the late reply. (2012/11/17 4:15), Marcelo Tosatti wrote: On Wed, Nov 14, 2012 at 05:26:10PM +0900, Yoshihiro YUNOMAE wrote: Thank you for commenting on my patch set. (2012/11/14 11:31), Steven Rostedt wrote: On Tue, 2012-11-13 at 18:03 -0800, David Sharp wrote: On Tue, Nov 13, 2012 at 6:00 PM, Steven Rostedt rost...@goodmis.org wrote: On Wed, 2012-11-14 at 10:36 +0900, Yoshihiro YUNOMAE wrote: To merge the data like previous pattern, we apply this patch set. Then, we can get TSC offset of the guest as follows: $ dmesg | grep kvm [ 57.717180] kvm: (2687) write TSC offset 18446743360465545001, now clock ## | PID TSC offset | HOST TSC value --+ Using printk to export something like this is IMO a nasty hack. Can't we create a /sys or /proc file to export the same thing? Since the value changes over the course of the trace, and seems to be part of the context of the trace, I think I'd include it as a tracepoint. I'm fine with that too. Using some tracepoint is a nice idea, but there is one problem. Here, our discussion point is the event which TSC offset is changed does not frequently occur, but the buffer must keep the event data. There are two ideas for using tracepoint. First, we define new tracepoint for changed TSC offset. This is simple and the overhead will be low. However, this trace event stored in the buffer will be overwritten by other trace events because this TSC offset event does not frequently occur. Second, we add TSC offset information to the tracepoint frequently occured. For example, we assume that TSC offset information is added to arguments of trace_kvm_exit(). The TSC offset is in the host trace. So given a host trace with two TSC offset updates, how do you know which events in the guest trace (containing a number of events) refer to which tsc offset update? 
Unless i am missing something, you can't solve this easily (well, except exporting information to the guest that allows it to transform RDTSC - host TSC value, which can be done via pvclock). As you say, TSC offset events are in the host trace, but we don't need to notify guests of updating TSC offset. The offset event will output the next TSC offset value and the current TSC value, so we can calculate the guest TSC (T1) for the event. Guest TSCs since T1 can be converted to host TSC using the TSC offset, so we can integrate those trace data. Another issue as mentioned is lack of TSC synchronization in the host. Should you provide such a feature without the possibility of proper chronological order on systems with unsynchronized TSC? I think, we cannot support this sorting feature using TSC on systems with unsynchronized TSC. On systems with unsynchronized TSC, it is difficult to sort not only trace data of guests and the host but trace data of a guest or a host using TSC in chronological order. Actually, if we want to output tracing data of ftrace in chronological order with unsynchronized TSC, we will use the global mode as the timestamp. The global mode uses wallclock added TSC correction, so the mode guarantees to sort in chronological order for trace data of the guest or of the host. If we use this mode to sort the trace data of guests and the host in chronological order, we need to consider about the difference between the guest and the host and timekeeping of guests and the host, so it is difficult to solve these issues. At least, I haven't came up with the good solution. We cannot sort the trace data of guests and the host in chronological order with unsynchronized TSC, but if we can set following synchronization events for both guests and the host, we will know where we should sort. First, a guest and the host uses the global mode as the timestamp of ftrace. 
Next, a user on the guest writes 1 to the synchronization I/F as the ID, then the synchronization event 1 is recorded in a ring-buffer of the guest. The synchronization operation induces a hypercall, so the host can handle the event. After the operation moves to the host, the host records the event 1 in a ring-buffer of the host. In the end, the operation returns to the guest, and the synchronization is finished. When we integrate tracing data of the guest and the host, we calculate the difference of the timestamp between the synchronizing events with the same ID. This value is a temporary offset. We will convert the timestamp of the guests to the timestamp of the host before the next synchronizing event. If the synchronizing event cycle is very short, we will not need to consider the timekeeping. Then, we can sort the trace data in chronological order. Would you comment on this, or do you have another idea? Thanks, -- Yoshihiro YUNOMAE Software Platform Research Dept. Linux Technology Center Hitachi, Ltd., Yokohama
Re: Re: Re: [RFC PATCH 0/2] kvm/vmx: Output TSC offset
Hi Steven, Sorry for the late reply. (2012/11/17 0:05), Steven Rostedt wrote: On Wed, 2012-11-14 at 17:26 +0900, Yoshihiro YUNOMAE wrote: Thank you for commenting on my patch set. (2012/11/14 11:31), Steven Rostedt wrote: On Tue, 2012-11-13 at 18:03 -0800, David Sharp wrote: On Tue, Nov 13, 2012 at 6:00 PM, Steven Rostedt rost...@goodmis.org wrote: On Wed, 2012-11-14 at 10:36 +0900, Yoshihiro YUNOMAE wrote: To merge the data like previous pattern, we apply this patch set. Then, we can get TSC offset of the guest as follows: $ dmesg | grep kvm [ 57.717180] kvm: (2687) write TSC offset 18446743360465545001, now clock ## | PID TSC offset | HOST TSC value --+ Using printk to export something like this is IMO a nasty hack. Can't we create a /sys or /proc file to export the same thing? Since the value changes over the course of the trace, and seems to be part of the context of the trace, I think I'd include it as a tracepoint. I'm fine with that too. Using some tracepoint is a nice idea, but there is one problem. Here, our discussion point is the event which TSC offset is changed does not frequently occur, but the buffer must keep the event data. If you can hold off a bit, for the 3.9 window, I plan on pushing multiple buffers for ftrace. That is, you can create a separate buffer just for the TSC offset events: cd /sys/kernel/debug echo tsc instances/new echo 1 instances/tsc/events/tsc/offset/enable Then the buffer will be used only for that event. That's good. The tracepoint will output as follows: qemu-kvm-12345 [000] 123456789: kvm_write_tsc_offset: now_tsc=123456789 previous_offset=0 next_offset=123456780 Thanks, -- Yoshihiro YUNOMAE Software Platform Research Dept. Linux Technology Center Hitachi, Ltd., Yokohama Research Laboratory E-mail: yoshihiro.yunomae...@hitachi.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 3/3] KVM: PPC: Book3S HV: Provide a method for userspace to read and write the HPT
On Tue, Nov 20, 2012 at 10:16:24AM +0100, Alexander Graf wrote: Documentation is out of sync :). Oops, sorry... :) Applied all 3 with fixed documentation. Great, thanks. Regards, Paul. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/3] KVM: x86: clean up reexecute_instruction
On Tue, Nov 20, 2012 at 07:58:32AM +0800, Xiao Guangrong wrote: Little cleanup for reexecute_instruction, also use gpa_to_gfn in retry_instruction Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/x86.c | 13 ++--- 1 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 52ae8b5..7be8452 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4477,19 +4477,18 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) if (tdp_enabled) return false; + gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); + if (gpa == UNMAPPED_GVA) + return true; /* let cpu generate fault */ + /* * if emulation was due to access to shadowed page table * and it failed try to unshadow page and re-enter the * guest to let CPU execute the instruction. */ - if (kvm_mmu_unprotect_page_virt(vcpu, gva)) + if (kvm_mmu_unprotect_page(vcpu-kvm, c(gpa))) What's c()? Should be gpa_to_gfn(gpa)? return true; - gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL); - - if (gpa == UNMAPPED_GVA) - return true; /* let cpu generate fault */ - /* * Do not retry the unhandleable instruction if it faults on the * readonly host memory, otherwise it will goto a infinite loop: @@ -4544,7 +4543,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, if (!vcpu-arch.mmu.direct_map) gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); - kvm_mmu_unprotect_page(vcpu-kvm, gpa PAGE_SHIFT); + kvm_mmu_unprotect_page(vcpu-kvm, gpa_to_gfn(gpa)); return true; } -- 1.7.7.6 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] KVM: Retire as maintainer
After six and a half years of writing and maintaining KVM, it is time to move to new things. Update my MAINTAINERS entry to reflect that. Signed-off-by: Avi Kivity a...@redhat.com --- CREDITS | 5 + MAINTAINERS | 1 - 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index d8fe12a..2346b09 100644 --- a/CREDITS +++ b/CREDITS @@ -1823,6 +1823,11 @@ S: Kattreinstr 38 S: D-64295 S: Germany +N: Avi Kivity +E: avi.kiv...@gmail.com +D: Kernel-based Virtual Machine (KVM) +S: Ra'annana, Israel + N: Andi Kleen E: a...@firstfloor.org U: http://www.halobates.de diff --git a/MAINTAINERS b/MAINTAINERS index bb0b27d..6b5b0b7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4280,7 +4280,6 @@ F:include/linux/lockd/ F: include/linux/sunrpc/ KERNEL VIRTUAL MACHINE (KVM) -M: Avi Kivity a...@redhat.com M: Marcelo Tosatti mtosa...@redhat.com L: kvm@vger.kernel.org W: http://kvm.qumranet.com -- 1.7.12 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: Retire as maintainer
On 11/20/2012 02:26 PM, Avi Kivity wrote: After six and a half years of writing and maintaining KVM, it is time to move to new things. Update my MAINTAINERS entry to reflect that. Gleb, please send an incremental patch (against kvm.git master) installing yourself as a new co-maintainer. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] KVM: taking co-maintenance
Updating MAINTAINERS file. Signed-off-by: Gleb Natapov g...@redhat.com diff --git a/MAINTAINERS b/MAINTAINERS index 6b5b0b7..b273360 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4281,6 +4281,7 @@ F:include/linux/sunrpc/ KERNEL VIRTUAL MACHINE (KVM) M: Marcelo Tosatti mtosa...@redhat.com +M: Gleb Natapov g...@redhat.com L: kvm@vger.kernel.org W: http://kvm.qumranet.com S: Supported -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: taking co-maintenance
On 11/20/2012 02:37 PM, Gleb Natapov wrote: Updating MAINTAINERS file. Applied, thanks. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[GIT PULL] KVM maintainership update
Linus, please pull from: git://git.kernel.org/pub/scm/virt/kvm/kvm.git master to receive this MAINTAINERS update. After many years of maintaining KVM, I am moving on. It was a real pleasure for me to work with so many talented and dedicated hackers on this project. Replacing me will be one of those talented and dedicated hackers, Gleb, who has authored hundreds of patches in and around KVM. Avi Kivity (1): KVM: Retire as maintainer Gleb Natapov (1): KVM: taking co-maintenance CREDITS | 5 + MAINTAINERS | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index d8fe12a..2346b09 100644 --- a/CREDITS +++ b/CREDITS @@ -1823,6 +1823,11 @@ S: Kattreinstr 38 S: D-64295 S: Germany +N: Avi Kivity +E: avi.kiv...@gmail.com +D: Kernel-based Virtual Machine (KVM) +S: Ra'annana, Israel + N: Andi Kleen E: a...@firstfloor.org U: http://www.halobates.de diff --git a/MAINTAINERS b/MAINTAINERS index bb0b27d..b273360 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4280,8 +4280,8 @@ F:include/linux/lockd/ F: include/linux/sunrpc/ KERNEL VIRTUAL MACHINE (KVM) -M: Avi Kivity a...@redhat.com M: Marcelo Tosatti mtosa...@redhat.com +M: Gleb Natapov g...@redhat.com L: kvm@vger.kernel.org W: http://kvm.qumranet.com S: Supported -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC v4 1/8] linux-headers: Add ARM KVM headers (not for upstream)
This commit adds the ARM KVM headers. This is not to go to QEMU upstream -- the correct path there is that the KVM code will be committed to a mainline upstream kernel, and then upstream QEMU can do a bulk header update from the upstream kernel, which will allow us to drop this temporary commit. This is the result of running update-headers on Christoffer's kvm-arm-v14-vgic-timers branch (commit 68d116f). This commit currently also includes some non-ARM header changes which hopefully will have made it into QEMU upstream by the time we submit this for merging. --- linux-headers/asm-arm/kvm.h | 137 ++ linux-headers/asm-arm/kvm_para.h |1 + linux-headers/asm-generic/kvm_para.h |4 + linux-headers/asm-powerpc/kvm.h | 59 +++ linux-headers/asm-powerpc/kvm_para.h |7 +- linux-headers/linux/kvm.h| 34 +++-- 6 files changed, 234 insertions(+), 8 deletions(-) create mode 100644 linux-headers/asm-arm/kvm.h create mode 100644 linux-headers/asm-arm/kvm_para.h create mode 100644 linux-headers/asm-generic/kvm_para.h diff --git a/linux-headers/asm-arm/kvm.h b/linux-headers/asm-arm/kvm.h new file mode 100644 index 000..b1c7871 --- /dev/null +++ b/linux-headers/asm-arm/kvm.h @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall c.d...@virtualopensystems.com + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#ifndef __ARM_KVM_H__ +#define __ARM_KVM_H__ + +#include asm/types.h +#include asm/ptrace.h + +#define __KVM_HAVE_GUEST_DEBUG +#define __KVM_HAVE_IRQ_LINE + +#define KVM_REG_SIZE(id) \ + (1U (((id) KVM_REG_SIZE_MASK) KVM_REG_SIZE_SHIFT)) + +struct kvm_regs { + struct pt_regs usr_regs;/* R0_usr - R14_usr, PC, CPSR */ + __u32 svc_regs[3]; /* SP_svc, LR_svc, SPSR_svc */ + __u32 abt_regs[3]; /* SP_abt, LR_abt, SPSR_abt */ + __u32 und_regs[3]; /* SP_und, LR_und, SPSR_und */ + __u32 irq_regs[3]; /* SP_irq, LR_irq, SPSR_irq */ + __u32 fiq_regs[8]; /* R8_fiq - R14_fiq, SPSR_fiq */ +}; + +/* Supported Processor Types */ +#define KVM_ARM_TARGET_CORTEX_A15 0 +#define KVM_ARM_NUM_TARGETS1 + +/* KVM_SET_DEVICE_ADDRESS ioctl id encoding */ +#define KVM_DEVICE_TYPE_SHIFT 0 +#define KVM_DEVICE_TYPE_MASK (0x KVM_DEVICE_TYPE_SHIFT) +#define KVM_DEVICE_ID_SHIFT16 +#define KVM_DEVICE_ID_MASK (0x KVM_DEVICE_ID_SHIFT) + +/* Supported device IDs */ +#define KVM_ARM_DEVICE_VGIC_V2 0 + +/* Supported VGIC address types */ +#define KVM_VGIC_V2_ADDR_TYPE_DIST 0 +#define KVM_VGIC_V2_ADDR_TYPE_CPU 1 + +struct kvm_vcpu_init { + __u32 target; + __u32 features[7]; +}; + +struct kvm_sregs { +}; + +struct kvm_fpu { +}; + +struct kvm_guest_debug_arch { +}; + +struct kvm_debug_exit_arch { +}; + +struct kvm_sync_regs { +}; + +struct kvm_arch_memory_slot { +}; + +/* If you need to interpret the index values, here is the key: */ +#define KVM_REG_ARM_COPROC_MASK0x0FFF +#define KVM_REG_ARM_COPROC_SHIFT 16 +#define KVM_REG_ARM_32_OPC2_MASK 0x0007 +#define KVM_REG_ARM_32_OPC2_SHIFT 0 +#define KVM_REG_ARM_OPC1_MASK 0x0078 +#define KVM_REG_ARM_OPC1_SHIFT 3 +#define KVM_REG_ARM_CRM_MASK 0x0780 +#define KVM_REG_ARM_CRM_SHIFT 7 +#define KVM_REG_ARM_32_CRN_MASK0x7800 +#define KVM_REG_ARM_32_CRN_SHIFT 11 + +/* Normal registers are mapped as coprocessor 16. 
*/ +#define KVM_REG_ARM_CORE (0x0010 KVM_REG_ARM_COPROC_SHIFT) +#define KVM_REG_ARM_CORE_REG(name) (offsetof(struct kvm_regs, name) / 4) + +/* Some registers need more space to represent values. */ +#define KVM_REG_ARM_DEMUX (0x0011 KVM_REG_ARM_COPROC_SHIFT) +#define KVM_REG_ARM_DEMUX_ID_MASK 0xFF00 +#define KVM_REG_ARM_DEMUX_ID_SHIFT 8 +#define KVM_REG_ARM_DEMUX_ID_CCSIDR(0x00 KVM_REG_ARM_DEMUX_ID_SHIFT) +#define KVM_REG_ARM_DEMUX_VAL_MASK 0x00FF +#define KVM_REG_ARM_DEMUX_VAL_SHIFT0 + +/* VFP registers: we could overload CP10 like ARM does, but that's ugly. */ +#define KVM_REG_ARM_VFP
[RFC v4 8/8] oslib-posix: Align to permit transparent hugepages on ARM Linux
ARM Linux (like x86-64 Linux) can use transparent hugepages for KVM if memory blocks are 2MiB aligned; set QEMU_VMALLOC_ALIGN accordingly. Signed-off-by: Peter Maydell peter.mayd...@linaro.org --- oslib-posix.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oslib-posix.c b/oslib-posix.c index 9db9c3d..d25b52a 100644 --- a/oslib-posix.c +++ b/oslib-posix.c @@ -35,7 +35,7 @@ extern int daemon(int, int); #endif -#if defined(__linux__) && defined(__x86_64__) +#if defined(__linux__) && (defined(__x86_64__) || defined(__arm__)) /* Use 2 MiB alignment so transparent hugepages can be used by KVM. Valgrind does not support alignments larger than 1 MiB, therefore we need special code which handles running on Valgrind. */ -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC v4 6/8] hw/kvm/arm_gic: Implement support for KVM in-kernel ARM GIC
Implement support for using the KVM in-kernel GIC for ARM. Signed-off-by: Peter Maydell peter.mayd...@linaro.org --- hw/a15mpcore.c |8 ++- hw/arm/Makefile.objs |1 + hw/kvm/arm_gic.c | 169 ++ 3 files changed, 177 insertions(+), 1 deletion(-) create mode 100644 hw/kvm/arm_gic.c diff --git a/hw/a15mpcore.c b/hw/a15mpcore.c index fc0a02a..31158f9 100644 --- a/hw/a15mpcore.c +++ b/hw/a15mpcore.c @@ -19,6 +19,7 @@ */ #include sysbus.h +#include kvm.h /* A15MP private memory region. */ @@ -40,8 +41,13 @@ static int a15mp_priv_init(SysBusDevice *dev) { A15MPPrivState *s = FROM_SYSBUS(A15MPPrivState, dev); SysBusDevice *busdev; +const char *gictype = arm-gic; -s-gic = qdev_create(NULL, arm_gic); +if (kvm_irqchip_in_kernel()) { +gictype = kvm-arm-gic; +} + +s-gic = qdev_create(NULL, gictype); qdev_prop_set_uint32(s-gic, num-cpu, s-num_cpu); qdev_prop_set_uint32(s-gic, num-irq, s-num_irq); qdev_prop_set_uint32(s-gic, revision, 2); diff --git a/hw/arm/Makefile.objs b/hw/arm/Makefile.objs index 6d049e7..38b10a8 100644 --- a/hw/arm/Makefile.objs +++ b/hw/arm/Makefile.objs @@ -31,5 +31,6 @@ obj-y += collie.o obj-y += imx_serial.o imx_ccm.o imx_timer.o imx_avic.o obj-y += kzm.o obj-$(CONFIG_FDT) += ../device_tree.o +obj-$(CONFIG_KVM) += kvm/arm_gic.o obj-y := $(addprefix ../,$(obj-y)) diff --git a/hw/kvm/arm_gic.c b/hw/kvm/arm_gic.c new file mode 100644 index 000..0ad1b8b --- /dev/null +++ b/hw/kvm/arm_gic.c @@ -0,0 +1,169 @@ +/* + * ARM Generic Interrupt Controller using KVM in-kernel support + * + * Copyright (c) 2012 Linaro Limited + * Written by Peter Maydell + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, see http://www.gnu.org/licenses/. + */ + +#include hw/sysbus.h +#include kvm.h +#include kvm_arm.h +#include hw/arm_gic_internal.h + +#define TYPE_KVM_ARM_GIC kvm-arm-gic +#define KVM_ARM_GIC(obj) \ + OBJECT_CHECK(GICState, (obj), TYPE_KVM_ARM_GIC) +#define KVM_ARM_GIC_CLASS(klass) \ + OBJECT_CLASS_CHECK(KVMARMGICClass, (klass), TYPE_KVM_ARM_GIC) +#define KVM_ARM_GIC_GET_CLASS(obj) \ + OBJECT_GET_CLASS(KVMARMGICClass, (obj), TYPE_KVM_ARM_GIC) + +typedef struct KVMARMGICClass { +ARMGICCommonClass parent_class; +int (*parent_init)(SysBusDevice *dev); +void (*parent_reset)(DeviceState *dev); +} KVMARMGICClass; + +static void kvm_arm_gic_set_irq(void *opaque, int irq, int level) +{ +/* Meaning of the 'irq' parameter: + * [0..N-1] : external interrupts + * [N..N+31] : PPI (internal) interrupts for CPU 0 + * [N+32..N+63] : PPI (internal interrupts for CPU 1 + * ... + * Convert this to the kernel's desired encoding, which + * has separate fields in the irq number for type, + * CPU number and interrupt number. + */ +GICState *s = (GICState *)opaque; +int kvm_irq, irqtype, cpu; + +if (irq (s-num_irq - GIC_INTERNAL)) { +/* External interrupt. The kernel numbers these like the GIC + * hardware, with external interrupt IDs starting after the + * internal ones. 
+ */ +irqtype = KVM_ARM_IRQ_TYPE_SPI; +cpu = 0; +irq += GIC_INTERNAL; +} else { +/* Internal interrupt: decode into (cpu, interrupt id) */ +irqtype = KVM_ARM_IRQ_TYPE_PPI; +irq -= (s-num_irq - GIC_INTERNAL); +cpu = irq / GIC_INTERNAL; +irq %= GIC_INTERNAL; +} +kvm_irq = (irqtype KVM_ARM_IRQ_TYPE_SHIFT) +| (cpu KVM_ARM_IRQ_VCPU_SHIFT) | irq; + +kvm_set_irq(kvm_state, kvm_irq, !!level); +} + +static void kvm_arm_gic_put(GICState *s) +{ +/* TODO: there isn't currently a kernel interface to set the GIC state */ +} + +static void kvm_arm_gic_get(GICState *s) +{ +/* TODO: there isn't currently a kernel interface to get the GIC state */ +} + +static void kvm_arm_gic_reset(DeviceState *dev) +{ +GICState *s = ARM_GIC_COMMON(dev); +KVMARMGICClass *kgc = KVM_ARM_GIC_GET_CLASS(s); +kgc-parent_reset(dev); +kvm_arm_gic_put(s); +} + +static int kvm_arm_gic_init(SysBusDevice *dev) +{ +/* Device instance init function for the GIC sysbus device */ +int i; +GICState *s = FROM_SYSBUS(GICState, dev); +KVMARMGICClass *kgc = KVM_ARM_GIC_GET_CLASS(s); + +kgc-parent_init(dev); + +i = s-num_irq - GIC_INTERNAL; +/* For the GIC, also
[RFC v4 0/8] QEMU: Support KVM on ARM
Round 4 of the QEMU patches to support KVM for ARM on Cortex-A15 hardware. It's intended for use with the kernel tree at git://github.com/virtualopensystems/linux-kvm-arm.git kvm-arm-v14-vgic-timers Still RFC pending the kernel patches actually being accepted upstream... Changes v3 to v4: * minor updates to match kernel ABI changes (ID field in kvm_device_address is now 64 bits, core register offsets now changed due to use of pt_regs struct) * squashed the two 'update kernel headers' patches, since the plan is for vgic support to go upstream at the same time as the baseline kernel patchset * added a new patch 8 which adds ARM to the list of Linux archs which prefer 2MB alignment so they can use transparent hugepages Changes v2 to v3: * applied various minor tweaks suggested during review of v2 * rebased on master, resynced with kernel headers for v13 * new patch 6 which uses a MemoryListener to track where the VGIC memory regions are mapped, so we can tell the kernel where they live in the memory map (via new ioctl KVM_SET_DEVICE_ADDRESS) Git tree available at git://git.linaro.org/people/pmaydell/qemu-arm.git kvm-arm-v14 with pointy-clicky interface at http://git.linaro.org/gitweb?p=people/pmaydell/qemu-arm.git;a=shortlog;h=refs/heads/kvm-arm-v14 Christoffer Dall (1): ARM: KVM: Add support for KVM on ARM architecture Peter Maydell (7): linux-headers: Add ARM KVM headers (not for upstream) ARM KVM: save and load VFP registers from kernel hw/arm_gic: Add presave/postload hooks target-arm: Use MemoryListener to identify GIC base address for KVM hw/kvm/arm_gic: Implement support for KVM in-kernel ARM GIC configure: Enable KVM on ARM oslib-posix: Align to permit transparent hugepages on ARM Linux configure|2 +- hw/a15mpcore.c |8 +- hw/arm/Makefile.objs |1 + hw/arm_gic_common.c | 10 + hw/arm_gic_internal.h|2 + hw/arm_pic.c | 26 ++ hw/kvm/arm_gic.c | 169 linux-headers/asm-arm/kvm.h | 137 ++ linux-headers/asm-arm/kvm_para.h |1 + linux-headers/asm-generic/kvm_para.h |4 + 
linux-headers/asm-powerpc/kvm.h | 59 + linux-headers/asm-powerpc/kvm_para.h |7 +- linux-headers/linux/kvm.h| 34 ++- oslib-posix.c|2 +- target-arm/Makefile.objs |1 + target-arm/cpu.h |1 + target-arm/helper.c |2 +- target-arm/kvm.c | 482 ++ target-arm/kvm_arm.h | 32 +++ 19 files changed, 968 insertions(+), 12 deletions(-) create mode 100644 hw/kvm/arm_gic.c create mode 100644 linux-headers/asm-arm/kvm.h create mode 100644 linux-headers/asm-arm/kvm_para.h create mode 100644 linux-headers/asm-generic/kvm_para.h create mode 100644 target-arm/kvm.c create mode 100644 target-arm/kvm_arm.h -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] vhost-blk: Add vhost-blk support v5
On Tue, Nov 20, 2012 at 02:39:40PM +0800, Asias He wrote: On 11/20/2012 04:26 AM, Michael S. Tsirkin wrote: On Mon, Nov 19, 2012 at 04:53:42PM +0800, Asias He wrote: vhost-blk is an in-kernel virito-blk device accelerator. Due to lack of proper in-kernel AIO interface, this version converts guest's I/O request to bio and use submit_bio() to submit I/O directly. So this version any supports raw block device as guest's disk image, e.g. /dev/sda, /dev/ram0. We can add file based image support to vhost-blk once we have in-kernel AIO interface. There are some work in progress for in-kernel AIO interface from Dave Kleikamp and Zach Brown: http://marc.info/?l=linux-fsdevelm=133312234313122 Performance evaluation: - 1) LKVM Fio with libaio ioengine on Fusion IO device using kvm tool IOPS(k)Before After Improvement seq-read 107 121 +13.0% seq-write 130 179 +37.6% rnd-read 102 122 +19.6% rnd-write 125 159 +27.0% 2) QEMU Fio with libaio ioengine on Fusion IO device using QEMU IOPS(k)Before After Improvement seq-read 76 123 +61.8% seq-write 139 173 +24.4% rnd-read 73 120 +64.3% rnd-write 75 156 +108.0% Could you compare with dataplane qemu as well please? Well, I will try to collect it. Userspace bits: - 1) LKVM The latest vhost-blk userspace bits for kvm tool can be found here: g...@github.com:asias/linux-kvm.git blk.vhost-blk 2) QEMU The latest vhost-blk userspace prototype for QEMU can be found here: g...@github.com:asias/qemu.git blk.vhost-blk Changes in v5: - Do not assume the buffer layout - Fix wakeup race Changes in v4: - Mark req-status as userspace pointer - Use __copy_to_user() instead of copy_to_user() in vhost_blk_set_status() - Add if (need_resched()) schedule() in blk thread - Kill vhost_blk_stop_vq() and move it into vhost_blk_stop() - Use vq_err() instead of pr_warn() - Fail un Unsupported request - Add flush in vhost_blk_set_features() Changes in v3: - Sending REQ_FLUSH bio instead of vfs_fsync, thanks Christoph! 
- Check file passed by user is a raw block device file Signed-off-by: Asias He as...@redhat.com Since there are files shared by this and vhost net it's easiest for me to merge this all through the vhost tree. Jens, could you ack this and the bio usage in this driver please? --- drivers/vhost/Kconfig | 1 + drivers/vhost/Kconfig.blk | 10 + drivers/vhost/Makefile| 2 + drivers/vhost/blk.c | 697 ++ drivers/vhost/blk.h | 8 + 5 files changed, 718 insertions(+) create mode 100644 drivers/vhost/Kconfig.blk create mode 100644 drivers/vhost/blk.c create mode 100644 drivers/vhost/blk.h diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 202bba6..acd8038 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -11,4 +11,5 @@ config VHOST_NET if STAGING source drivers/vhost/Kconfig.tcm +source drivers/vhost/Kconfig.blk endif diff --git a/drivers/vhost/Kconfig.blk b/drivers/vhost/Kconfig.blk new file mode 100644 index 000..ff8ab76 --- /dev/null +++ b/drivers/vhost/Kconfig.blk @@ -0,0 +1,10 @@ +config VHOST_BLK + tristate Host kernel accelerator for virtio blk (EXPERIMENTAL) + depends on BLOCK EXPERIMENTAL m + ---help--- +This kernel module can be loaded in host kernel to accelerate +guest block with virtio_blk. Not to be confused with virtio_blk +module itself which needs to be loaded in guest kernel. + +To compile this driver as a module, choose M here: the module will +be called vhost_blk. diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile index a27b053..1a8a4a5 100644 --- a/drivers/vhost/Makefile +++ b/drivers/vhost/Makefile @@ -2,3 +2,5 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o vhost_net-y := vhost.o net.o obj-$(CONFIG_TCM_VHOST) += tcm_vhost.o +obj-$(CONFIG_VHOST_BLK) += vhost_blk.o +vhost_blk-y := blk.o diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c new file mode 100644 index 000..f0f118a --- /dev/null +++ b/drivers/vhost/blk.c @@ -0,0 +1,697 @@ +/* + * Copyright (C) 2011 Taobao, Inc. 
+ * Author: Liu Yuan tailai...@taobao.com + * + * Copyright (C) 2012 Red Hat, Inc. + * Author: Asias He as...@redhat.com + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * virtio-blk server in host kernel. + */ + +#include linux/miscdevice.h +#include linux/module.h +#include linux/vhost.h +#include linux/virtio_blk.h +#include linux/mutex.h +#include linux/file.h +#include linux/kthread.h +#include linux/blkdev.h +#include
Re: [PATCH 0/4] AER-KVM: Error containment of PCI pass-thru devices assigned to KVM guests
On Tue, Nov 20, 2012 at 06:31:48AM +, Pandarathil, Vijaymohan R wrote: Add support for error containment when a PCI pass-thru device assigned to a KVM guest encounters an error. This is for PCIe devices/drivers that support AER functionality. When the OS is notified of an error in a device either through the firmware first approach or through an interrupt handled by the AER root port driver, concerned subsystems are notified by invoking callbacks registered by these subsystems. The device is also marked as tainted till the corresponding driver recovery routines are successful. KVM module registers for a notification of such errors. In the KVM callback routine, a global counter is incremented to keep track of the error notification. Before each CPU enters guest mode to execute guest code, appropriate checks are done to see if the impacted device belongs to the guest or not. If the device belongs to the guest, qemu hypervisor for the guest is informed and the guest is immediately brought down, thus preventing or minimizing chances of any bad data being written out by the guest driver after the device has encountered an error. I'm surprised that the hypervisor would shut down the guest when PCIe AER kicks in for a pass-through device. Shouldn't we pass the AER event into the guest and deal with it there? The equivalent to this policy on physical hardware would be that the CPU is reset or the machine is powered down on AER. That doesn't sound right. Stefan -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [kvmarm] [RFC v4 8/8] oslib-posix: Align to permit transparent hugepages on ARM Linux
On 11/20/2012 02:31 PM, Peter Maydell wrote: ARM Linux (like x86-64 Linux) can use transparent hugepages for KVM if memory blocks are 2MiB aligned; set QEMU_VMALLOC_ALIGN accordingly. Signed-off-by: Peter Maydell peter.mayd...@linaro.org --- oslib-posix.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oslib-posix.c b/oslib-posix.c index 9db9c3d..d25b52a 100644 --- a/oslib-posix.c +++ b/oslib-posix.c @@ -35,7 +35,7 @@ extern int daemon(int, int); #endif -#if defined(__linux__) && defined(__x86_64__) +#if defined(__linux__) && (defined(__x86_64__) || defined(__arm__)) Why not just drop the arch specific bit? Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC v4 7/8] configure: Enable KVM on ARM
Enable KVM on ARM hosts, now that all the necessary components for it exist. Signed-off-by: Peter Maydell peter.mayd...@linaro.org --- configure |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure b/configure index 780b19a..2438d6d 100755 --- a/configure +++ b/configure @@ -3927,7 +3927,7 @@ case $target_arch2 in echo CONFIG_NO_XEN=y $config_target_mak esac case $target_arch2 in - i386|x86_64|ppcemb|ppc|ppc64|s390x) + arm|i386|x86_64|ppcemb|ppc|ppc64|s390x) # Make sure the target and host cpus are compatible if test $kvm = yes -a $target_softmmu = yes -a \ \( $target_arch2 = $cpu -o \ -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC v4 2/8] ARM: KVM: Add support for KVM on ARM architecture
From: Christoffer Dall cd...@cs.columbia.edu Add basic support for KVM on ARM architecture. Signed-off-by: Christoffer Dall cd...@cs.columbia.edu [PMM: Minor tweaks and code cleanup, switch to ONE_REG] Signed-off-by: Peter Maydell peter.mayd...@linaro.org --- hw/arm_pic.c | 26 target-arm/Makefile.objs |1 + target-arm/cpu.h |1 + target-arm/helper.c |2 +- target-arm/kvm.c | 327 ++ 5 files changed, 356 insertions(+), 1 deletion(-) create mode 100644 target-arm/kvm.c diff --git a/hw/arm_pic.c b/hw/arm_pic.c index ffb4d41..874bbaf 100644 --- a/hw/arm_pic.c +++ b/hw/arm_pic.c @@ -9,6 +9,7 @@ #include hw.h #include arm-misc.h +#include kvm.h /* Input 0 is IRQ and input 1 is FIQ. */ static void arm_pic_cpu_handler(void *opaque, int irq, int level) @@ -34,7 +35,32 @@ static void arm_pic_cpu_handler(void *opaque, int irq, int level) } } +static void kvm_arm_pic_cpu_handler(void *opaque, int irq, int level) +{ +#ifdef CONFIG_KVM +ARMCPU *cpu = opaque; +CPUARMState *env = cpu-env; +int kvm_irq = KVM_ARM_IRQ_TYPE_CPU KVM_ARM_IRQ_TYPE_SHIFT; + +switch (irq) { +case ARM_PIC_CPU_IRQ: +kvm_irq |= KVM_ARM_IRQ_CPU_IRQ; +break; +case ARM_PIC_CPU_FIQ: +kvm_irq |= KVM_ARM_IRQ_CPU_FIQ; +break; +default: +hw_error(kvm_arm_pic_cpu_handler: Bad interrupt line %d\n, irq); +} +kvm_irq |= env-cpu_index KVM_ARM_IRQ_VCPU_SHIFT; +kvm_set_irq(kvm_state, kvm_irq, level ? 
1 : 0); +#endif +} + qemu_irq *arm_pic_init_cpu(ARMCPU *cpu) { +if (kvm_enabled()) { +return qemu_allocate_irqs(kvm_arm_pic_cpu_handler, cpu, 2); +} return qemu_allocate_irqs(arm_pic_cpu_handler, cpu, 2); } diff --git a/target-arm/Makefile.objs b/target-arm/Makefile.objs index b6f1a9e..d89b57c 100644 --- a/target-arm/Makefile.objs +++ b/target-arm/Makefile.objs @@ -1,4 +1,5 @@ obj-y += arm-semi.o obj-$(CONFIG_SOFTMMU) += machine.o +obj-$(CONFIG_KVM) += kvm.o obj-y += translate.o op_helper.o helper.o cpu.o obj-y += neon_helper.o iwmmxt_helper.o diff --git a/target-arm/cpu.h b/target-arm/cpu.h index e4ff918..44b5b4f 100644 --- a/target-arm/cpu.h +++ b/target-arm/cpu.h @@ -236,6 +236,7 @@ ARMCPU *cpu_arm_init(const char *cpu_model); void arm_translate_init(void); int cpu_arm_exec(CPUARMState *s); void do_interrupt(CPUARMState *); +int bank_number(CPUARMState *env, int mode); void switch_mode(CPUARMState *, int); uint32_t do_arm_semihosting(CPUARMState *env); diff --git a/target-arm/helper.c b/target-arm/helper.c index ab8b734..6760d76 100644 --- a/target-arm/helper.c +++ b/target-arm/helper.c @@ -1614,7 +1614,7 @@ uint32_t HELPER(get_r13_banked)(CPUARMState *env, uint32_t mode) #else /* Map CPU modes onto saved register banks. */ -static inline int bank_number(CPUARMState *env, int mode) +int bank_number(CPUARMState *env, int mode) { switch (mode) { case ARM_CPU_MODE_USR: diff --git a/target-arm/kvm.c b/target-arm/kvm.c new file mode 100644 index 000..8e4b989 --- /dev/null +++ b/target-arm/kvm.c @@ -0,0 +1,327 @@ +/* + * ARM implementation of KVM hooks + * + * Copyright Christoffer Dall 2009-2010 + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ * + */ + +#include stdio.h +#include sys/types.h +#include sys/ioctl.h +#include sys/mman.h + +#include linux/kvm.h + +#include qemu-common.h +#include qemu-timer.h +#include sysemu.h +#include kvm.h +#include cpu.h +#include hw/arm-misc.h + +const KVMCapabilityInfo kvm_arch_required_capabilities[] = { +KVM_CAP_LAST_INFO +}; + +int kvm_arch_init(KVMState *s) +{ +/* For ARM interrupt delivery is always asynchronous, + * whether we are using an in-kernel VGIC or not. + */ +kvm_async_interrupts_allowed = true; +return 0; +} + +int kvm_arch_init_vcpu(CPUARMState *env) +{ +struct kvm_vcpu_init init; + +init.target = KVM_ARM_TARGET_CORTEX_A15; +memset(init.features, 0, sizeof(init.features)); +return kvm_vcpu_ioctl(env, KVM_ARM_VCPU_INIT, init); +} + +typedef struct Reg { +uint64_t id; +int offset; +} Reg; + +#define COREREG(KERNELNAME, QEMUFIELD) \ +{\ +KVM_REG_ARM | KVM_REG_SIZE_U32 | \ +KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(KERNELNAME), \ +offsetof(CPUARMState, QEMUFIELD) \ +} + +#define CP15REG(CRN, CRM, OPC1, OPC2, QEMUFIELD) \ +{\ +KVM_REG_ARM | KVM_REG_SIZE_U32 | \ +(15 KVM_REG_ARM_COPROC_SHIFT) | \ +((CRN) KVM_REG_ARM_32_CRN_SHIFT) |\ +((CRM) KVM_REG_ARM_CRM_SHIFT) | \ +((OPC1) KVM_REG_ARM_OPC1_SHIFT) | \ +((OPC2) KVM_REG_ARM_32_OPC2_SHIFT), \ +
[RFC v4 3/8] ARM KVM: save and load VFP registers from kernel
Add support for saving and restoring VFP register state from the kernel. This includes a check that the KVM-created CPU has full VFP support (as the TCG Cortex-A15 model always does), since for the moment ARM QEMU doesn't have any way to tweak optional features on created CPUs. Signed-off-by: Peter Maydell peter.mayd...@linaro.org --- target-arm/kvm.c | 78 +++--- 1 file changed, 75 insertions(+), 3 deletions(-) diff --git a/target-arm/kvm.c b/target-arm/kvm.c index 8e4b989..4217ad6 100644 --- a/target-arm/kvm.c +++ b/target-arm/kvm.c @@ -38,10 +38,28 @@ int kvm_arch_init(KVMState *s) int kvm_arch_init_vcpu(CPUARMState *env) { struct kvm_vcpu_init init; +int ret; +uint64_t v; +struct kvm_one_reg r; init.target = KVM_ARM_TARGET_CORTEX_A15; memset(init.features, 0, sizeof(init.features)); -return kvm_vcpu_ioctl(env, KVM_ARM_VCPU_INIT, init); +ret = kvm_vcpu_ioctl(env, KVM_ARM_VCPU_INIT, init); +if (ret) { +return ret; +} +/* Query the kernel to make sure it supports 32 VFP + * registers: QEMU's cortex-a15 CPU is always a + * VFP-D32 core. The simplest way to do this is just + * to attempt to read register d31. + */ +r.id = KVM_REG_ARM | KVM_REG_SIZE_U64 | KVM_REG_ARM_VFP | 31; +r.addr = (uintptr_t)(v); +ret = kvm_vcpu_ioctl(env, KVM_GET_ONE_REG, r); +if (ret == ENOENT) { +return EINVAL; +} +return ret; } typedef struct Reg { @@ -67,6 +85,13 @@ typedef struct Reg { offsetof(CPUARMState, QEMUFIELD) \ } +#define VFPSYSREG(R) \ +{ \ +KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_VFP | \ +KVM_REG_ARM_VFP_##R, \ +offsetof(CPUARMState, vfp.xregs[ARM_VFP_##R]) \ +} + static const Reg regs[] = { /* R0_usr .. 
R14_usr */ COREREG(usr_regs.uregs[0], regs[0]), @@ -114,6 +139,13 @@ static const Reg regs[] = { CP15REG(1, 0, 0, 0, cp15.c1_sys), /* SCTLR */ CP15REG(2, 0, 0, 2, cp15.c2_control), /* TTBCR */ CP15REG(3, 0, 0, 0, cp15.c3), /* DACR */ +/* VFP system registers */ +VFPSYSREG(FPSID), +VFPSYSREG(MVFR1), +VFPSYSREG(MVFR0), +VFPSYSREG(FPEXC), +VFPSYSREG(FPINST), +VFPSYSREG(FPINST2), }; int kvm_arch_put_registers(CPUARMState *env, int level) @@ -121,7 +153,7 @@ int kvm_arch_put_registers(CPUARMState *env, int level) struct kvm_one_reg r; int mode, bn; int ret, i; -uint32_t cpsr; +uint32_t cpsr, fpscr; uint64_t ttbr; /* Make sure the banked regs are properly set */ @@ -172,6 +204,26 @@ int kvm_arch_put_registers(CPUARMState *env, int level) (2 KVM_REG_ARM_CRM_SHIFT) | (1 KVM_REG_ARM_OPC1_SHIFT); r.addr = (uintptr_t)(ttbr); ret = kvm_vcpu_ioctl(env, KVM_SET_ONE_REG, r); +if (ret) { +return ret; +} + +/* VFP registers */ +r.id = KVM_REG_ARM | KVM_REG_SIZE_U64 | KVM_REG_ARM_VFP; +for (i = 0; i 32; i++) { +r.addr = (uintptr_t)(env-vfp.regs[i]); +ret = kvm_vcpu_ioctl(env, KVM_SET_ONE_REG, r); +if (ret) { +return ret; +} +r.id++; +} + +r.id = KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_VFP | +KVM_REG_ARM_VFP_FPSCR; +fpscr = vfp_get_fpscr(env); +r.addr = (uintptr_t)fpscr; +ret = kvm_vcpu_ioctl(env, KVM_SET_ONE_REG, r); return ret; } @@ -181,7 +233,7 @@ int kvm_arch_get_registers(CPUARMState *env) struct kvm_one_reg r; int mode, bn; int ret, i; -uint32_t cpsr; +uint32_t cpsr, fpscr; uint64_t ttbr; for (i = 0; i ARRAY_SIZE(regs); i++) { @@ -246,6 +298,26 @@ int kvm_arch_get_registers(CPUARMState *env) env-cp15.c2_mask = ~(0xu env-cp15.c2_control); env-cp15.c2_base_mask = ~(0x3fffu env-cp15.c2_control); +/* VFP registers */ +r.id = KVM_REG_ARM | KVM_REG_SIZE_U64 | KVM_REG_ARM_VFP; +for (i = 0; i 32; i++) { +r.addr = (uintptr_t)(env-vfp.regs[i]); +ret = kvm_vcpu_ioctl(env, KVM_GET_ONE_REG, r); +if (ret) { +return ret; +} +r.id++; +} + +r.id = KVM_REG_ARM | KVM_REG_SIZE_U32 | 
KVM_REG_ARM_VFP | +KVM_REG_ARM_VFP_FPSCR; +r.addr = (uintptr_t)fpscr; +ret = kvm_vcpu_ioctl(env, KVM_GET_ONE_REG, r); +if (ret) { +return ret; +} +vfp_set_fpscr(env, fpscr); + return 0; } -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC v4 4/8] hw/arm_gic: Add presave/postload hooks
Add presave/postload hooks to the ARM GIC common base class. These will be used by the KVM in-kernel GIC subclass to sync state between kernel and userspace when migrating. Signed-off-by: Peter Maydell peter.mayd...@linaro.org Reviewed-by: Andreas Färber afaer...@suse.de --- hw/arm_gic_common.c | 10 ++ hw/arm_gic_internal.h |2 ++ 2 files changed, 12 insertions(+) diff --git a/hw/arm_gic_common.c b/hw/arm_gic_common.c index 8369309..961b44c 100644 --- a/hw/arm_gic_common.c +++ b/hw/arm_gic_common.c @@ -23,9 +23,14 @@ static void gic_save(QEMUFile *f, void *opaque) { GICState *s = (GICState *)opaque; +ARMGICCommonClass *c = ARM_GIC_COMMON_GET_CLASS(s); int i; int j; +if (c-pre_save) { +c-pre_save(s); +} + qemu_put_be32(f, s-enabled); for (i = 0; i s-num_cpu; i++) { qemu_put_be32(f, s-cpu_enabled[i]); @@ -57,6 +62,7 @@ static void gic_save(QEMUFile *f, void *opaque) static int gic_load(QEMUFile *f, void *opaque, int version_id) { GICState *s = (GICState *)opaque; +ARMGICCommonClass *c = ARM_GIC_COMMON_GET_CLASS(s); int i; int j; @@ -91,6 +97,10 @@ static int gic_load(QEMUFile *f, void *opaque, int version_id) s-irq_state[i].trigger = qemu_get_byte(f); } +if (c-post_load) { +c-post_load(s); +} + return 0; } diff --git a/hw/arm_gic_internal.h b/hw/arm_gic_internal.h index 699352c..3640be0 100644 --- a/hw/arm_gic_internal.h +++ b/hw/arm_gic_internal.h @@ -118,6 +118,8 @@ void gic_init_irqs_and_distributor(GICState *s, int num_irq); typedef struct ARMGICCommonClass { SysBusDeviceClass parent_class; +void (*pre_save)(GICState *s); +void (*post_load)(GICState *s); } ARMGICCommonClass; #define TYPE_ARM_GIC arm_gic -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC v4 5/8] target-arm: Use MemoryListener to identify GIC base address for KVM
When using an in-kernel GIC with KVM, we need to tell the kernel where the GIC's memory mapped registers live. Do this by registering a MemoryListener which tracks where the board model maps the A15's private peripherals, so we can finish the GIC initialisation when the GIC is actually mapped. Signed-off-by: Peter Maydell peter.mayd...@linaro.org --- target-arm/kvm.c | 83 ++ target-arm/kvm_arm.h | 32 +++ 2 files changed, 115 insertions(+) create mode 100644 target-arm/kvm_arm.h diff --git a/target-arm/kvm.c b/target-arm/kvm.c index 4217ad6..ff3007b 100644 --- a/target-arm/kvm.c +++ b/target-arm/kvm.c @@ -19,6 +19,7 @@ #include qemu-timer.h #include sysemu.h #include kvm.h +#include kvm_arm.h #include cpu.h #include hw/arm-misc.h @@ -62,6 +63,88 @@ int kvm_arch_init_vcpu(CPUARMState *env) return ret; } +/* We track all the KVM devices which need their memory addresses + * passing to the kernel in a list of these structures. + * When board init is complete we run through the list and + * tell the kernel the base addresses of the memory regions. + * We use a MemoryListener to track mapping and unmapping of + * the regions during board creation, so the board models don't + * need to do anything special for the KVM case. 
+ */ +typedef struct KVMDevice { +struct kvm_device_address kda; +MemoryRegion *mr; +QSLIST_ENTRY(KVMDevice) entries; +} KVMDevice; + +static QSLIST_HEAD(kvm_devices_head, KVMDevice) kvm_devices_head; + +static void kvm_arm_devlistener_add(MemoryListener *listener, +MemoryRegionSection *section) +{ +KVMDevice *kd; +QSLIST_FOREACH(kd, kvm_devices_head, entries) { +if (section-mr == kd-mr) { +kd-kda.addr = section-offset_within_address_space; +} +} +} + +static void kvm_arm_devlistener_del(MemoryListener *listener, +MemoryRegionSection *section) +{ +KVMDevice *kd; +QSLIST_FOREACH(kd, kvm_devices_head, entries) { +if (section-mr == kd-mr) { +kd-kda.addr = -1; +} +} +} + +static MemoryListener devlistener = { +.region_add = kvm_arm_devlistener_add, +.region_del = kvm_arm_devlistener_del, +}; + +static void kvm_arm_machine_init_done(Notifier *notifier, void *data) +{ +KVMDevice *kd, *tkd; +memory_listener_unregister(devlistener); +QSLIST_FOREACH_SAFE(kd, kvm_devices_head, entries, tkd) { +if (kd-kda.addr != -1) { +if (kvm_vm_ioctl(kvm_state, KVM_SET_DEVICE_ADDRESS, kd-kda) 0) { +fprintf(stderr, KVM_SET_DEVICE_ADDRESS failed: %s\n, +strerror(errno)); +abort(); +} +} +g_free(kd); +} +} + +static Notifier notify = { +.notify = kvm_arm_machine_init_done, +}; + +void kvm_arm_register_device(MemoryRegion *mr, uint64_t devid) +{ +KVMDevice *kd; + +if (!kvm_irqchip_in_kernel()) { +return; +} + +if (QSLIST_EMPTY(kvm_devices_head)) { +memory_listener_register(devlistener, NULL); +qemu_add_machine_init_done_notifier(notify); +} +kd = g_new0(KVMDevice, 1); +kd-mr = mr; +kd-kda.id = devid; +kd-kda.addr = -1; +QSLIST_INSERT_HEAD(kvm_devices_head, kd, entries); +} + typedef struct Reg { uint64_t id; int offset; diff --git a/target-arm/kvm_arm.h b/target-arm/kvm_arm.h new file mode 100644 index 000..2f61d00 --- /dev/null +++ b/target-arm/kvm_arm.h @@ -0,0 +1,32 @@ +/* + * QEMU KVM support -- ARM specific functions. 
+ * + * Copyright (c) 2012 Linaro Limited + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef QEMU_KVM_ARM_H +#define QEMU_KVM_ARM_H + +#include kvm.h +#include memory.h + +/** + * kvm_arm_register_device: + * @mr: memory region for this device + * @devid: the KVM device ID + * + * Remember the memory region @mr, and when it is mapped by the + * machine model, tell the kernel that base address using the + * KVM_SET_DEVICE_ADDRESS ioctl. @devid should be the ID of + * the device as defined by KVM_SET_DEVICE_ADDRESS. + * The machine model may map and unmap the device multiple times; + * the kernel will only be told the final address at the point + * where machine init is complete. + */ +void kvm_arm_register_device(MemoryRegion *mr, uint64_t devid); + +#endif -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [kvmarm] [RFC v4 8/8] oslib-posix: Align to permit transparent hugepages on ARM Linux
On Tue, Nov 20, 2012 at 8:54 AM, Alexander Graf ag...@suse.de wrote: On 11/20/2012 02:31 PM, Peter Maydell wrote: ARM Linux (like x86-64 Linux) can use transparent hugepages for KVM if memory blocks are 2MiB aligned; set QEMU_VMALLOC_ALIGN accordingly. Signed-off-by: Peter Maydellpeter.mayd...@linaro.org --- oslib-posix.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oslib-posix.c b/oslib-posix.c index 9db9c3d..d25b52a 100644 --- a/oslib-posix.c +++ b/oslib-posix.c @@ -35,7 +35,7 @@ extern int daemon(int, int); #endif -#if defined(__linux__) defined(__x86_64__) +#if defined(__linux__) (defined(__x86_64__) || defined(__arm__)) Why not just drop the arch specific bit? other archs have other alignment requirements, iirc. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 0/4] AER-KVM: Error containment of PCI pass-thru devices assigned to KVM guests
-Original Message- From: Stefan Hajnoczi [mailto:stefa...@gmail.com] Sent: Tuesday, November 20, 2012 5:41 AM To: Pandarathil, Vijaymohan R Cc: kvm@vger.kernel.org; linux-...@vger.kernel.org; qemu-de...@nongnu.org; linux-ker...@vger.kernel.org Subject: Re: [PATCH 0/4] AER-KVM: Error containment of PCI pass-thru devices assigned to KVM guests On Tue, Nov 20, 2012 at 06:31:48AM +, Pandarathil, Vijaymohan R wrote: Add support for error containment when a PCI pass-thru device assigned to a KVM guest encounters an error. This is for PCIe devices/drivers that support AER functionality. When the OS is notified of an error in a device either through the firmware first approach or through an interrupt handled by the AER root port driver, concerned subsystems are notified by invoking callbacks registered by these subsystems. The device is also marked as tainted till the corresponding driver recovery routines are successful. KVM module registers for a notification of such errors. In the KVM callback routine, a global counter is incremented to keep track of the error notification. Before each CPU enters guest mode to execute guest code, appropriate checks are done to see if the impacted device belongs to the guest or not. If the device belongs to the guest, qemu hypervisor for the guest is informed and the guest is immediately brought down, thus preventing or minimizing chances of any bad data being written out by the guest driver after the device has encountered an error. I'm surprised that the hypervisor would shut down the guest when PCIe AER kicks in for a pass-through device. Shouldn't we pass the AER event into the guest and deal with it there? Agreed. That would be the ideal behavior and is planned in a future patch. Lack of control over the capabilities/type of the OS/drivers running in the guest is also a concern in passing along the event to the guest. 
My understanding is that in the current implementation of Linux/KVM, these errors are not handled at all and can potentially cause a guest hang or crash or even data corruption depending on the implementation of the guest driver for the device. As a first step, these patches make the behavior better by doing error containment with a predictable behavior when such errors occur. The equivalent to this policy on physical hardware would be that the CPU is reset or the machine is powered down on AER. That doesn't sound right. Stefan -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [kvmarm] [RFC v4 8/8] oslib-posix: Align to permit transparent hugepages on ARM Linux
On 11/20/2012 02:55 PM, Christoffer Dall wrote: On Tue, Nov 20, 2012 at 8:54 AM, Alexander Grafag...@suse.de wrote: On 11/20/2012 02:31 PM, Peter Maydell wrote: ARM Linux (like x86-64 Linux) can use transparent hugepages for KVM if memory blocks are 2MiB aligned; set QEMU_VMALLOC_ALIGN accordingly. Signed-off-by: Peter Maydellpeter.mayd...@linaro.org --- oslib-posix.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oslib-posix.c b/oslib-posix.c index 9db9c3d..d25b52a 100644 --- a/oslib-posix.c +++ b/oslib-posix.c @@ -35,7 +35,7 @@ extern int daemon(int, int); #endif -#if defined(__linux__) defined(__x86_64__) +#if defined(__linux__) (defined(__x86_64__) || defined(__arm__)) Why not just drop the arch specific bit? other archs have other alignment requirements, iirc. Ah, sorry, missed the rest of the lines around this one :). Yeah, should be ok then. Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [kvmarm] [RFC v4 8/8] oslib-posix: Align to permit transparent hugepages on ARM Linux
On 20 November 2012 14:37, Alexander Graf ag...@suse.de wrote: On 11/20/2012 02:55 PM, Christoffer Dall wrote: On Tue, Nov 20, 2012 at 8:54 AM, Alexander Grafag...@suse.de wrote: On 11/20/2012 02:31 PM, Peter Maydell wrote: ARM Linux (like x86-64 Linux) can use transparent hugepages for KVM if memory blocks are 2MiB aligned; set QEMU_VMALLOC_ALIGN accordingly. -#if defined(__linux__) defined(__x86_64__) +#if defined(__linux__) (defined(__x86_64__) || defined(__arm__)) Why not just drop the arch specific bit? other archs have other alignment requirements, iirc. Ah, sorry, missed the rest of the lines around this one :). Yeah, should be ok then. Yeah. Ideally the kernel would provide a mechanism so we can ask at runtime what the preferred alignment is. (Or it could just automatically provide it for suitably large allocations.) In the absence of that this is just following along with the current style. -- PMM -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM call agenda for 2012-11-20
Juan Quintela quint...@redhat.com wrote: Hi Please send in any agenda topics you are interested in. As there are no topics, the call is cancelled. happy hacking, Juan. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Messed up with XML files and configuration of a VM
Hi, first, i'm new to kvm. I'm running KVM on a sles 11 sp2, kernel 3.0.13-0.27-default. My guest is an Ubuntu 12.0.4 LTS 64bit. The guest has attached a CDROM, using an iso-file from a CIFS-Share. I detached it with the virtual machine manager (0.9.0). I don't see the cd-rom anymore in the virtual machine manager. But when i try to start the vm, it complains about the missing iso-file. Why ? I detached it. When i like to have a look in the xml-files of the guest, i found three ! One in /var/lib/kvm/images, one in /etc/libvirt/qemu and one in /etc/kvm/vm. Which one should i use to configure the vm ? In the one in /etc/libvirt/qemu the cifs-share isn't mentioned any longer, in the other two it is still. Is it possible to configure the vm editing one of the XML-files ? Or shall i use virsh ? Using virsh, does the vm has to be stopped or can i edit the configuration for a running vm ? Why three xml-files ? Why is detaching with the virtual machine manager not working ? Thanks for any inspiration. Bernd -- Bernd Lentes Systemadministration Institut für Entwicklungsgenetik Gebäude 35.34 - Raum 208 HelmholtzZentrum münchen bernd.len...@helmholtz-muenchen.de phone: +49 89 3187 1241 fax: +49 89 3187 2294 http://www.helmholtz-muenchen.de/idg Wir sollten nicht den Tod fürchten, sondern das schlechte Leben Helmholtz Zentrum München Deutsches Forschungszentrum für Gesundheit und Umwelt (GmbH) Ingolstädter Landstr. 1 85764 Neuherberg www.helmholtz-muenchen.de Aufsichtsratsvorsitzende: MinDir´in Bärbel Brumme-Bothe Geschäftsführer: Prof. Dr. Günther Wess und Dr. Nikolaus Blum Registergericht: Amtsgericht München HRB 6466 USt-IdNr: DE 129521671 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 13/16] kvm tools: keep track of registered memory banks in struct kvm
Hi Sasha, On Tue, Nov 13, 2012 at 04:37:38AM +, Sasha Levin wrote: On 11/12/2012 06:57 AM, Will Deacon wrote: struct kvm { struct kvm_arch arch; struct kvm_config cfg; @@ -49,6 +56,7 @@ struct kvm { u64 ram_size; void*ram_start; u64 ram_pagesize; + struct list_headmem_banks; These memory banks actually look like a perfect example to use our augmented interval rb-tree, can we switch them to use it, or is it a list on purpose? I found some time to look at this today but unfortunately they're not as ideally suited to the interval tree as they look: the problem being that we need to search for banks by both host virtual address *and* guest physical address depending on the translation that we're doing. We could have two separate tress, but that seems like overkill given the likely number of banks. Will -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] vfio powerpc: enabled and supported on powernv platform
On Tue, 2012-11-20 at 11:48 +1100, Alexey Kardashevskiy wrote: VFIO implements platform independent stuff such as a PCI driver, BAR access (via read/write on a file descriptor or direct mapping when possible) and IRQ signaling. The platform dependent part includes IOMMU initialization and handling. This patch initializes IOMMU groups based on the IOMMU configuration discovered during the PCI scan, only POWERNV platform is supported at the moment. Also the patch implements an VFIO-IOMMU driver which manages DMA mapping/unmapping requests coming from the client (now QEMU). It also returns a DMA window information to let the guest initialize the device tree for a guest OS properly. Although this driver has been tested only on POWERNV, it should work on any platform supporting TCE tables. To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config option. Cc: David Gibson da...@gibson.dropbear.id.au Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/iommu.h |6 + arch/powerpc/kernel/iommu.c | 140 +++ arch/powerpc/platforms/powernv/pci.c | 135 +++ drivers/iommu/Kconfig|8 ++ drivers/vfio/Kconfig |6 + drivers/vfio/Makefile|1 + drivers/vfio/vfio_iommu_spapr_tce.c | 247 ++ include/linux/vfio.h | 20 +++ 8 files changed, 563 insertions(+) create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index cbfe678..5ba66cb 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -64,30 +64,33 @@ struct iommu_pool { } cacheline_aligned_in_smp; struct iommu_table { unsigned long it_busno; /* Bus number this table belongs to */ unsigned long it_size; /* Size of iommu table in entries */ unsigned long it_offset;/* Offset into global table */ unsigned long it_base; /* mapped address of tce table */ unsigned long it_index; /* which iommu table this is */ unsigned long it_type; /* type: PCI or Virtual Bus */ unsigned long it_blocksize; /* Entries in each 
block (cacheline) */ unsigned long poolsize; unsigned long nr_pools; struct iommu_pool large_pool; struct iommu_pool pools[IOMMU_NR_POOLS]; unsigned long *it_map; /* A simple allocation bitmap for now */ +#ifdef CONFIG_IOMMU_API + struct iommu_group *it_group; +#endif }; struct scatterlist; static inline void set_iommu_table_base(struct device *dev, void *base) { dev-archdata.dma_data.iommu_table_base = base; } static inline void *get_iommu_table_base(struct device *dev) { return dev-archdata.dma_data.iommu_table_base; } /* Frees table for an individual device node */ @@ -135,17 +138,20 @@ static inline void pci_iommu_init(void) { } extern void alloc_dart_table(void); #if defined(CONFIG_PPC64) defined(CONFIG_PM) static inline void iommu_save(void) { if (ppc_md.iommu_save) ppc_md.iommu_save(); } static inline void iommu_restore(void) { if (ppc_md.iommu_restore) ppc_md.iommu_restore(); } #endif +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry, uint64_t tce, + enum dma_data_direction direction, unsigned long pages); + #endif /* __KERNEL__ */ #endif /* _ASM_IOMMU_H */ diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index ff5a6ce..94f614b 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -32,30 +32,31 @@ #include linux/dma-mapping.h #include linux/bitmap.h #include linux/iommu-helper.h #include linux/crash_dump.h #include linux/hash.h #include linux/fault-inject.h #include linux/pci.h #include asm/io.h #include asm/prom.h #include asm/iommu.h #include asm/pci-bridge.h #include asm/machdep.h #include asm/kdump.h #include asm/fadump.h #include asm/vio.h +#include asm/tce.h #define DBG(...) 
static int novmerge; static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int); static int __init setup_iommu(char *str) { if (!strcmp(str, novmerge)) novmerge = 1; else if (!strcmp(str, vmerge)) novmerge = 0; return 1; } @@ -844,15 +845,154 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl, } void iommu_free_coherent(struct iommu_table *tbl, size_t size, void *vaddr, dma_addr_t dma_handle) { if (tbl) { unsigned int nio_pages; size = PAGE_ALIGN(size); nio_pages = size IOMMU_PAGE_SHIFT; iommu_free(tbl, dma_handle, nio_pages); size =
Re: [PATCH 1/3] KVM: x86: clean up reexecute_instruction
On 11/20/2012 08:11 PM, Gleb Natapov wrote: On Tue, Nov 20, 2012 at 07:58:32AM +0800, Xiao Guangrong wrote: Little cleanup for reexecute_instruction, also use gpa_to_gfn in retry_instruction Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/x86.c | 13 ++--- 1 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 52ae8b5..7be8452 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4477,19 +4477,18 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) if (tdp_enabled) return false; +gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); +if (gpa == UNMAPPED_GVA) +return true; /* let cpu generate fault */ + /* * if emulation was due to access to shadowed page table * and it failed try to unshadow page and re-enter the * guest to let CPU execute the instruction. */ -if (kvm_mmu_unprotect_page_virt(vcpu, gva)) +if (kvm_mmu_unprotect_page(vcpu-kvm, c(gpa))) What's c()? Should be gpa_to_gfn(gpa)? Yes. It is the stupid copy-paste error. Thanks you for pointing it out, Gleb! This is the new one have fixed it. Subject: [PATCH 1/3] KVM: x86: clean up reexecute_instruction Little cleanup for reexecute_instruction, also use gpa_to_gfn in retry_instruction Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/x86.c | 13 ++--- 1 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 52ae8b5..7be8452 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4477,19 +4477,18 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) if (tdp_enabled) return false; + gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); + if (gpa == UNMAPPED_GVA) + return true; /* let cpu generate fault */ + /* * if emulation was due to access to shadowed page table * and it failed try to unshadow page and re-enter the * guest to let CPU execute the instruction. 
*/ - if (kvm_mmu_unprotect_page_virt(vcpu, gva)) + if (kvm_mmu_unprotect_page(vcpu-kvm, gpa_to_gfn(gpa))) return true; - gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL); - - if (gpa == UNMAPPED_GVA) - return true; /* let cpu generate fault */ - /* * Do not retry the unhandleable instruction if it faults on the * readonly host memory, otherwise it will goto a infinite loop: @@ -4544,7 +4543,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, if (!vcpu-arch.mmu.direct_map) gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); - kvm_mmu_unprotect_page(vcpu-kvm, gpa PAGE_SHIFT); + kvm_mmu_unprotect_page(vcpu-kvm, gpa_to_gfn(gpa)); return true; } -- 1.7.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 13/16] kvm tools: keep track of registered memory banks in struct kvm
On 11/20/2012 12:15 PM, Will Deacon wrote: Hi Sasha, On Tue, Nov 13, 2012 at 04:37:38AM +, Sasha Levin wrote: On 11/12/2012 06:57 AM, Will Deacon wrote: struct kvm { struct kvm_arch arch; struct kvm_config cfg; @@ -49,6 +56,7 @@ struct kvm { u64 ram_size; void*ram_start; u64 ram_pagesize; + struct list_headmem_banks; These memory banks actually look like a perfect example to use our augmented interval rb-tree, can we switch them to use it, or is it a list on purpose? I found some time to look at this today but unfortunately they're not as ideally suited to the interval tree as they look: the problem being that we need to search for banks by both host virtual address *and* guest physical address depending on the translation that we're doing. We could have two separate tress, but that seems like overkill given the likely number of banks. Makes sense. We can convert it later if we need to as well. Thanks, Sasha -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v11] kvm: notify host when the guest is panicked
On Tue, Nov 20, 2012 at 06:09:48PM +0800, Hu Tao wrote: Hi Marcelo, On Tue, Nov 13, 2012 at 12:19:08AM -0200, Marcelo Tosatti wrote: On Fri, Nov 09, 2012 at 03:17:39PM -0500, Sasha Levin wrote: On Mon, Nov 5, 2012 at 8:58 PM, Hu Tao hu...@cn.fujitsu.com wrote: But in the case of panic notification, more dependency means more chances of failure of panic notification. Say, if we use a virtio device to do panic notification, then we will fail if: virtio itself has problems, virtio for some reason can't be deployed(neither built-in or as a module), or guest doesn't support virtio, etc. Add polling to your virtio device. If it didn't notify of a panic but taking more than 20 sec to answer your poll request you can assume it's dead. Actually, just use virtio-serial and something in userspace on the guest. They want the guest to stop, so a memory dump can be taken by management interface. Hu Tao, lets assume port I/O is the preferred method for communication. Okey. Now, the following comments have still not been addressed: 1) Lifecycle of the stopped guest and interaction with other stopped states in QEMU. Patch 3 already deals with run state transitions. But in case I'm missing something, could you be more specific? - What are the possibilities during migration? Say: - migration starts. - guest panics. - migration starts vm on other side? - Guest stopped due to EIO. - guest vcpuN panics, VMEXIT but still outside QEMU. - QEMU EIO error, stop vm. - guest vcpuN completes, processes IO exit. - system_reset due to panic. - Add all possibilities that should be verified (that is, interaction of this feature with other stopped states in QEMU). --- - What happens if the guest has reboot-on-panic configured? Does it take precedence over hypervisor notification? Out of curiosity, does kexec support memory dumping? 2) Format of the interface for other architectures (you can choose a different KVM supported architecture and write an example). 
3) Clear/documented management interface for the feature. It is documented in patch 0: Documentation/virtual/kvm/pv_event.txt. Does it need to be improved? This is documentation for the host-guest interface. There is no documentation on the interface for management. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v5] kvm/fpu: Enable fully eager restore kvm FPU
On Wed, Nov 07, 2012 at 10:01:11AM +0800, Xudong Hao wrote: Romove fpu lazy restore logic, using eager restore totally. v5 changes from v4: - remove lazy fpu restore totally, fpu eager restore does not have performance regression and simple the code. v4 changes from v3: - Wrap up some confused code with a clear function lazy_fpu_allowed() - Update fpu while update cr4 too. v3 changes from v2: - Make fpu active explicitly while guest xsave is enabling and non-lazy xstate bit exist. v2 changes from v1: - Expand KVM_XSTATE_LAZY to 64 bits before negating it. Signed-off-by: Xudong Hao xudong@intel.com --- arch/x86/kvm/vmx.c | 9 ++--- arch/x86/kvm/x86.c | 8 +--- include/linux/kvm_host.h | 1 - 3 files changed, 3 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6599e45..c1fd2e1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1197,7 +1197,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) u32 eb; eb = (1u PF_VECTOR) | (1u UD_VECTOR) | (1u MC_VECTOR) | - (1u NM_VECTOR) | (1u DB_VECTOR); + (1u DB_VECTOR); if ((vcpu-guest_debug (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) Please remove the code entirely, including: if (is_no_device(intr_info)) { vmx_fpu_activate(vcpu); return 1; } and clts handling. fpu_active/fpu_deactivate callbacks become unused, don't they? Also remove fpu_active variable. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 4/5] KVM: MMU: move adjusting softmmu pte access to FNAME(page_fault)
On Mon, Nov 05, 2012 at 08:12:07PM +0800, Xiao Guangrong wrote: Then, no mmu specified code exists in the common function and drop two parameters in set_spte Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com arch/x86/kvm/mmu.c | 42 +++--- arch/x86/kvm/paging_tmpl.h | 25 - 2 files changed, 31 insertions(+), 36 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 49957df..4229e78 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2351,8 +2351,7 @@ static bool gfn_need_write_protect(struct kvm_vcpu *vcpu, u64 *sptep, /* The return value indicates whether the @gfn need to be write protected. */ static bool vcpu_adjust_access(struct kvm_vcpu *vcpu, u64 *sptep, -unsigned *pte_access, int user_fault, -int write_fault, int level, gfn_t gfn, +unsigned *pte_access, int level, gfn_t gfn, bool can_unsync, bool host_writable) { bool ret = false; @@ -2361,21 +2360,6 @@ static bool vcpu_adjust_access(struct kvm_vcpu *vcpu, u64 *sptep, if (!host_writable) access = ~ACC_WRITE_MASK; - if (!(access ACC_WRITE_MASK) (!vcpu-arch.mmu.direct_map - write_fault !is_write_protection(vcpu) !user_fault)) { - access |= ACC_WRITE_MASK; - access = ~ACC_USER_MASK; - - /* - * If we converted a user page to a kernel page, - * so that the kernel can write to it when cr0.wp=0, - * then we should prevent the kernel from executing it - * if SMEP is enabled. 
- */ - if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) - access = ~ACC_EXEC_MASK; - } - if ((access ACC_WRITE_MASK) gfn_need_write_protect(vcpu, sptep, level, gfn, can_unsync)) { access = ~ACC_WRITE_MASK; @@ -2387,8 +2371,7 @@ static bool vcpu_adjust_access(struct kvm_vcpu *vcpu, u64 *sptep, } static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned pte_access, int user_fault, - int write_fault, int level, + unsigned pte_access, int level, gfn_t gfn, pfn_t pfn, bool speculative, bool can_unsync, bool host_writable) { @@ -2398,8 +2381,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (set_mmio_spte(sptep, gfn, pfn, pte_access)) return 0; - ret = vcpu_adjust_access(vcpu, sptep, pte_access, user_fault, - write_fault, level, gfn, can_unsync, host_writable); + ret = vcpu_adjust_access(vcpu, sptep, pte_access, level, gfn, + can_unsync, host_writable); spte = PT_PRESENT_MASK; if (!speculative) @@ -2440,17 +2423,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pt_access, unsigned pte_access, - int user_fault, int write_fault, - int *emulate, int level, gfn_t gfn, - pfn_t pfn, bool speculative, + int write_fault, int *emulate, int level, + gfn_t gfn, pfn_t pfn, bool speculative, bool host_writable) { bool was_rmapped = false; - pgprintk(%s: spte %llx access %x write_fault %d - user_fault %d gfn %llx\n, - __func__, *sptep, pt_access, - write_fault, user_fault, gfn); + pgprintk(%s: spte %llx access %x write_fault %d gfn %llx\n, + __func__, *sptep, pt_access, write_fault, gfn); if (is_rmap_spte(*sptep)) { if (pfn != spte_to_pfn(*sptep)) { @@ -2462,7 +2442,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, was_rmapped = true; } - if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, + if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative, true, host_writable)) { if (write_fault) @@ -2556,7 +2536,7 @@ static int direct_pte_prefetch_many(struct 
kvm_vcpu *vcpu, for (i = 0; i ret; i++, gfn++, start++) mmu_set_spte(vcpu, start, ACC_ALL, - access, 0, 0, NULL, + access, 0, NULL, sp-role.level, gfn, page_to_pfn(pages[i]), true, true); @@ -2620,7 +2600,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, unsigned pte_access = ACC_ALL; mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, -
Re: [PATCH 2/5] KVM: MMU: simplify mmu_set_spte
On Tue, Nov 13, 2012 at 04:39:44PM +0800, Xiao Guangrong wrote: On 11/13/2012 07:12 AM, Marcelo Tosatti wrote: On Mon, Nov 05, 2012 at 08:10:08PM +0800, Xiao Guangrong wrote: In order to detecting spte remapping, we can simply check whether the spte has already been pointing to the pfn even if the spte is not the last spte for middle spte is pointing to the kernel pfn which can not be mapped to userspace Also, update slot and stat.lpages iff the spte is not remapped Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/mmu.c | 40 +--- 1 files changed, 13 insertions(+), 27 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 692ebb1..4ea731e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2420,8 +2420,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, pfn_t pfn, bool speculative, bool host_writable) { - int was_rmapped = 0; - int rmap_count; + bool was_rmapped = false; pgprintk(%s: spte %llx access %x write_fault %d user_fault %d gfn %llx\n, @@ -2429,25 +2428,13 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, write_fault, user_fault, gfn); if (is_rmap_spte(*sptep)) { - /* - * If we overwrite a PTE page pointer with a 2MB PMD, unlink - * the parent of the now unreachable PTE. - */ - if (level PT_PAGE_TABLE_LEVEL - !is_large_pte(*sptep)) { - struct kvm_mmu_page *child; - u64 pte = *sptep; + if (pfn != spte_to_pfn(*sptep)) { + struct kvm_mmu_page *sp = page_header(__pa(sptep)); - child = page_header(pte PT64_BASE_ADDR_MASK); - drop_parent_pte(child, sptep); - kvm_flush_remote_tlbs(vcpu-kvm); How come its safe to drop this case? We use if (pfn != spte_to_pfn(*sptep)) to simplify the thing. There are two cases: 1) the sptep is not the last mapping. under this case, sptep must point to a shadow page table, that means spte_to_pfn(*sptep)) is used by KVM module, and 'pfn' is used by userspace. so, 'if' condition must be satisfied, the sptep will be dropped. 
Actually, This is the origin case: | if (level PT_PAGE_TABLE_LEVEL | !is_large_pte(*sptep)) 2) the sptep is the last mapping. under this case, the level of spte (sp.level) must equal the 'level' which we pass to mmu_set_spte. If they point to the same pfn, it is 'remap', otherwise we drop it. I think this is safe. :) mmu_page_zap_pte takes care of it, OK. What if was_rmapped=true but gfn is different? Say if the spte comes from an unsync shadow page, the guest modifies that shadow page (but does not invalidate it with invlpg), then faults. gfn can still point to the same gfn (but in that case, with your patch, page_header_update_slot is not called. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/5] KVM: MMU: simplify set_spte
On Mon, Nov 05, 2012 at 08:11:03PM +0800, Xiao Guangrong wrote: It is more cleaner if we can update pte_access fist then set spte according to pte_access, also introduce gfn_need_write_protect to check whether the gfn need to be write-protected Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com Please separate patch in: - code movement with no logical modification. - logical modification (such as condition for mark_page_dirty). - move code to helper functions. arch/x86/kvm/mmu.c | 109 1 files changed, 67 insertions(+), 42 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4ea731e..49957df 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2329,6 +2329,63 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, return 0; } +static bool gfn_need_write_protect(struct kvm_vcpu *vcpu, u64 *sptep, +int level, gfn_t gfn, bool can_unsync) +{ + /* + * Optimization: for pte sync, if spte was writable the hash + * lookup is unnecessary (and expensive). Write protection + * is responsibility of mmu_get_page / kvm_sync_page. + * Same reasoning can be applied to dirty page accounting. + */ + if (!can_unsync is_writable_pte(*sptep)) + return false; + + if ((level PT_PAGE_TABLE_LEVEL +has_wrprotected_page(vcpu-kvm, gfn, level)) || + mmu_need_write_protect(vcpu, gfn, can_unsync)) + return true; + + return false; +} + +/* The return value indicates whether the @gfn need to be write protected. 
*/ +static bool vcpu_adjust_access(struct kvm_vcpu *vcpu, u64 *sptep, +unsigned *pte_access, int user_fault, +int write_fault, int level, gfn_t gfn, +bool can_unsync, bool host_writable) +{ + bool ret = false; + unsigned access = *pte_access; + + if (!host_writable) + access = ~ACC_WRITE_MASK; + + if (!(access ACC_WRITE_MASK) (!vcpu-arch.mmu.direct_map + write_fault !is_write_protection(vcpu) !user_fault)) { + access |= ACC_WRITE_MASK; + access = ~ACC_USER_MASK; + + /* + * If we converted a user page to a kernel page, + * so that the kernel can write to it when cr0.wp=0, + * then we should prevent the kernel from executing it + * if SMEP is enabled. + */ + if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) + access = ~ACC_EXEC_MASK; + } + + if ((access ACC_WRITE_MASK) + gfn_need_write_protect(vcpu, sptep, level, gfn, can_unsync)) { + access = ~ACC_WRITE_MASK; + ret = true; + } + + *pte_access = access; + return ret; +} + static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, int user_fault, int write_fault, int level, @@ -2341,6 +2398,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (set_mmio_spte(sptep, gfn, pfn, pte_access)) return 0; + ret = vcpu_adjust_access(vcpu, sptep, pte_access, user_fault, + write_fault, level, gfn, can_unsync, host_writable); + spte = PT_PRESENT_MASK; if (!speculative) spte |= shadow_accessed_mask; @@ -2353,61 +2413,26 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (pte_access ACC_USER_MASK) spte |= shadow_user_mask; + if (pte_access ACC_WRITE_MASK) { + spte |= PT_WRITABLE_MASK; + spte |= SPTE_MMU_WRITEABLE; + } + if (level PT_PAGE_TABLE_LEVEL) spte |= PT_PAGE_SIZE_MASK; + if (tdp_enabled) spte |= kvm_x86_ops-get_mt_mask(vcpu, gfn, kvm_is_mmio_pfn(pfn)); if (host_writable) spte |= SPTE_HOST_WRITEABLE; - else - pte_access = ~ACC_WRITE_MASK; spte |= (u64)pfn PAGE_SHIFT; - if ((pte_access ACC_WRITE_MASK) - || (!vcpu-arch.mmu.direct_map write_fault - !is_write_protection(vcpu) !user_fault)) { - spte 
|= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; - - if (!vcpu-arch.mmu.direct_map - !(pte_access ACC_WRITE_MASK)) { - spte = ~PT_USER_MASK; - /* - * If we converted a user page to a kernel page, - * so that the kernel can write to it when cr0.wp=0, - * then we should prevent the kernel from executing it - * if SMEP is enabled. - */ - if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) -
Re: Re: Re: [RFC PATCH 0/2] kvm/vmx: Output TSC offset
On Tue, Nov 20, 2012 at 07:36:33PM +0900, Yoshihiro YUNOMAE wrote: Hi Marcelo, Sorry for the late reply. (2012/11/17 4:15), Marcelo Tosatti wrote: On Wed, Nov 14, 2012 at 05:26:10PM +0900, Yoshihiro YUNOMAE wrote: Thank you for commenting on my patch set. (2012/11/14 11:31), Steven Rostedt wrote: On Tue, 2012-11-13 at 18:03 -0800, David Sharp wrote: On Tue, Nov 13, 2012 at 6:00 PM, Steven Rostedt rost...@goodmis.org wrote: On Wed, 2012-11-14 at 10:36 +0900, Yoshihiro YUNOMAE wrote: To merge the data like previous pattern, we apply this patch set. Then, we can get TSC offset of the guest as follows: $ dmesg | grep kvm [ 57.717180] kvm: (2687) write TSC offset 18446743360465545001, now clock ## | PID TSC offset | HOST TSC value --+ Using printk to export something like this is IMO a nasty hack. Can't we create a /sys or /proc file to export the same thing? Since the value changes over the course of the trace, and seems to be part of the context of the trace, I think I'd include it as a tracepoint. I'm fine with that too. Using some tracepoint is a nice idea, but there is one problem. Here, our discussion point is the event which TSC offset is changed does not frequently occur, but the buffer must keep the event data. There are two ideas for using tracepoint. First, we define new tracepoint for changed TSC offset. This is simple and the overhead will be low. However, this trace event stored in the buffer will be overwritten by other trace events because this TSC offset event does not frequently occur. Second, we add TSC offset information to the tracepoint frequently occured. For example, we assume that TSC offset information is added to arguments of trace_kvm_exit(). The TSC offset is in the host trace. So given a host trace with two TSC offset updates, how do you know which events in the guest trace (containing a number of events) refer to which tsc offset update? 
Unless i am missing something, you can't solve this easily (well, except exporting information to the guest that allows it to transform RDTSC - host TSC value, which can be done via pvclock). As you say, TSC offset events are in the host trace, but we don't need to notify guests of updating TSC offset. The offset event will output the next TSC offset value and the current TSC value, so we can calculate the guest TSC (T1) for the event. Guest TSCs since T1 can be converted to host TSC using the TSC offset, so we can integrate those trace data. Think of this scenario: host trace 1h. event tsc write tsc_offset=-1000 3h. vmenter 4h. vmexit ... (event sequence) 99h. vmexit 100h. event tsc_write tsc_offset=-2000 101h. vmenter ... (event sequence). 500h. event tsc_write tsc_offset=-3000 Then a guest trace containing events with a TSC timestamp. Which tsc_offset to use? (that is the problem, which unless i am mistaken can only be solved easily if the guest can convert RDTSC - TSC of host). Another issue as mentioned is lack of TSC synchronization in the host. Should you provide such a feature without the possibility of proper chronological order on systems with unsynchronized TSC? I think, we cannot support this sorting feature using TSC on systems with unsynchronized TSC. On systems with unsynchronized TSC, it is difficult to sort not only trace data of guests and the host but trace data of a guest or a host using TSC in chronological order. Actually, if we want to output tracing data of ftrace in chronological order with unsynchronized TSC, we will use the global mode as the timestamp. The global mode uses wallclock added TSC correction, so the mode guarantees to sort in chronological order for trace data of the guest or of the host. 
If we use this mode to sort the trace data of guests and the host in chronological order, we need to consider about the difference between the guest and the host and timekeeping of guests and the host, so it is difficult to solve these issues. At least, I haven't came up with the good solution. I suppose the tradeoff is performance (RDTSC) versus reliability, when using ftrace. But then, even ftrace on the host suffers from the same problem, with unsynchronized TSCs. We cannot sort the trace data of guests and the host in chronological order with unsynchronized TSC, but if we can set following synchronization events for both guests and the host, we will know where we should sort. First, a guest and the host uses the global mode as the timestamp of ftrace. Next, a user on the guest writes 1 to the synchronization I/F as the ID, then the synchronization event 1 is recorded in a ring-buffer of the guest. The synchronization operation induces hypercall, so the host can handle the event. After the
Re: [PATCH 2/5] KVM: MMU: simplify mmu_set_spte
On 11/21/2012 06:18 AM, Marcelo Tosatti wrote: - child = page_header(pte PT64_BASE_ADDR_MASK); - drop_parent_pte(child, sptep); - kvm_flush_remote_tlbs(vcpu-kvm); How come its safe to drop this case? We use if (pfn != spte_to_pfn(*sptep)) to simplify the thing. There are two cases: 1) the sptep is not the last mapping. under this case, sptep must point to a shadow page table, that means spte_to_pfn(*sptep)) is used by KVM module, and 'pfn' is used by userspace. so, 'if' condition must be satisfied, the sptep will be dropped. Actually, This is the origin case: | if (level PT_PAGE_TABLE_LEVEL | !is_large_pte(*sptep)) 2) the sptep is the last mapping. under this case, the level of spte (sp.level) must equal the 'level' which we pass to mmu_set_spte. If they point to the same pfn, it is 'remap', otherwise we drop it. I think this is safe. :) mmu_page_zap_pte takes care of it, OK. What if was_rmapped=true but gfn is different? Say if the spte comes from an unsync shadow page, the guest modifies that shadow page (but does not invalidate it with invlpg), then faults. gfn can still point to the same gfn (but in that case, with your patch, page_header_update_slot is not called. Marcelo, Page fault path and other sync/prefetch paths will reread guest page table, then it get a different target pfn. The scenario is like this: gfn1 = pfn1, gfn2 = pfn2 gpte = pfn1, spte is shadowed by gpte and it is a unsync spte Guest Host spte = (gfn1, pfn1) modify gpte to let it point to gfn2 spte = (gfn1, pfn1) page-fault on gpte intercept the page-fault, then want to update spte to (gfn2, pfn2) in mmu_set_spte, we can detect pfn2 != pfn1, then drop it. Hmm, the interesting thing is what if different gfns map to the same pfn. For example, spte1 is shadowed by gfn1 and spte2 is shadowed by pfn2, both gfn1 and gfn2 map to pfn, the code (including the current code) will set spte1 to the gfn2's rmap and spte2 to the gfn1's rmap. But i think it is ok. 
-- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/5] KVM: MMU: simplify set_spte
On 11/21/2012 06:24 AM, Marcelo Tosatti wrote: On Mon, Nov 05, 2012 at 08:11:03PM +0800, Xiao Guangrong wrote: It is cleaner if we can update pte_access first then set spte according to pte_access, also introduce gfn_need_write_protect to check whether the gfn needs to be write-protected Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com Please separate patch in: - code movement with no logical modification. - logical modification (such as condition for mark_page_dirty). - move code to helper functions. Okay, will split it. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 4/5] KVM: MMU: move adjusting softmmu pte access to FNAME(page_fault)
On 11/21/2012 06:27 AM, Marcelo Tosatti wrote: @@ -544,6 +544,21 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, return 0; } +if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) + && !is_write_protection(vcpu) && !user_fault) { +walker.pte_access |= ACC_WRITE_MASK; +walker.pte_access &= ~ACC_USER_MASK; + +/* + * If we converted a user page to a kernel page, + * so that the kernel can write to it when cr0.wp=0, + * then we should prevent the kernel from executing it + * if SMEP is enabled. + */ +if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) +walker.pte_access &= ~ACC_EXEC_MASK; +} + What about sync_page path? The sync_page and other prefetch paths only do read-prefetch, meaning they call set_spte with write_fault = 0. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/5] KVM: MMU: simplify mmu_set_spte
On Wed, Nov 21, 2012 at 07:23:26AM +0800, Xiao Guangrong wrote: On 11/21/2012 06:18 AM, Marcelo Tosatti wrote: -child = page_header(pte PT64_BASE_ADDR_MASK); -drop_parent_pte(child, sptep); -kvm_flush_remote_tlbs(vcpu-kvm); How come its safe to drop this case? We use if (pfn != spte_to_pfn(*sptep)) to simplify the thing. There are two cases: 1) the sptep is not the last mapping. under this case, sptep must point to a shadow page table, that means spte_to_pfn(*sptep)) is used by KVM module, and 'pfn' is used by userspace. so, 'if' condition must be satisfied, the sptep will be dropped. Actually, This is the origin case: | if (level PT_PAGE_TABLE_LEVEL |!is_large_pte(*sptep)) 2) the sptep is the last mapping. under this case, the level of spte (sp.level) must equal the 'level' which we pass to mmu_set_spte. If they point to the same pfn, it is 'remap', otherwise we drop it. I think this is safe. :) mmu_page_zap_pte takes care of it, OK. What if was_rmapped=true but gfn is different? Say if the spte comes from an unsync shadow page, the guest modifies that shadow page (but does not invalidate it with invlpg), then faults. gfn can still point to the same gfn (but in that case, with your patch, page_header_update_slot is not called. Marcelo, Page fault path and other sync/prefetch paths will reread guest page table, then it get a different target pfn. The scenario is like this: gfn1 = pfn1, gfn2 = pfn2 gpte = pfn1, spte is shadowed by gpte and it is a unsync spte Guest Host spte = (gfn1, pfn1) modify gpte to let it point to gfn2 spte = (gfn1, pfn1) page-fault on gpte intercept the page-fault, then want to update spte to (gfn2, pfn2) in mmu_set_spte, we can detect pfn2 != pfn1, then drop it. Hmm, the interesting thing is what if different gfns map to the same pfn. For example, spte1 is shadowed by gfn1 and spte2 is shadowed by pfn2, both gfn1 and gfn2 map to pfn, the code (including the current code) will set spte1 to the gfn2's rmap and spte2 to the gfn1's rmap. 
But i think it is ok. Current code updates gfn properly in set_spte by page_header_update_slot. Better keep state properly. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Interrupt controller updates
Hi Jan ! David (CC) wants to make some progress with our in-kernel PIC. From memory, one of the outcomes of the BOF was that we need to move the existing enable in-kernel PIC from generic KVM init to machine init in order to be able to add an argument indicating the model used by the arch/platform since some like ours support several different models and since that all needs to be selected before the VCPUs are created. Again, from memory, you were volunteered to do the initial x86 change so we could piggy back on it :-) Or do I remember wrong ? Cheers, Ben. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: Guest performance is reduced after live migration
Dear all I continue watching a mailing list whether a similar problem is reported because a problem does not seem to happen to others. Any information, however small, would be appreciated. regards -Original Message- From: Uehara, Shouta (shouta.ueh...@jp.yokogawa.com) Sent: Friday, November 09, 2012 6:52 PM To: 'kvm@vger.kernel.org'; 'Xiao Guangrong (xiaoguangr...@linux.vnet.ibm.com)' Subject: RE: Guest performance is reduced after live migration I've analysed the problem with migration using perf-events, and it is confirmed cost of glibc is significantly increased. I made a simple test code to execute read system call on the guest as follows, and examined the performance from host and guest. fd = open(/dev/zero, O_RDONLY); while (1) { read(fd ch, 1); } --- [Source host] $ sudo perf kvm --host --guest record -a -o src_host.perf sleep 10 Events: 10K cycles 89.36% qemu-system-x86 [unknown] [g] 0x810a36ef 9.04% qemu-system-x86 [unknown] [u] 0x3fd20e2020 0.16% swapper [kernel.kallsyms] [k] intel_idle 0.11%sleep [kernel.kallsyms] [k] page_fault [Guest on source host] $ perf record -o src_guest.perf ./loop_read Events: 29K cpu-clock 11.71% loop_read [kernel.kallsyms] [k] system_call_after_swapgs 9.58% loop_read libc-2.14.90.so[.] __GI___libc_read 6.92% loop_read [kernel.kallsyms] [k] vfs_read 5.53% loop_read [kernel.kallsyms] [k] fsnotify __GI___libc_read :003fd20e2010 __read: 2.31 : 3fd20e2010: 83 3d 1d 22 2d 00 00cmpl $0x0,0x2d221d( 5.67 : 3fd20e2017: 75 10 jne 3fd20e2029 __ : :003fd20e2019 __read_nocancel: 1.82 : 3fd20e2019: b8 00 00 00 00 mov $0x0,%eax 0.00 : 3fd20e201e: 0f 05 syscall 87.78 : 3fd20e2020: 48 3d 01 f0 ff ff cmp $0xfff 0.00 : 3fd20e2026: 73 31 jae 3fd20e2059 __ 2.42 : 3fd20e2028: c3 retq [Destination host] $ sudo perf kvm --host --guest record -a -o dst_host.perf sleep 10 Events: 10K cycles 58.39% qemu-system-x86 [unknown] [g] 0x810a3a6e 40.14% qemu-system-x86 [unknown] [u] 0x3fd20e2017 0.13% gnome-shell nouveau_dri.so [.] 
0xbd7c9 0.11% swapper [kernel.kallsyms] [k] intel_idle [Guest on destination host] $ perf record -o dst_guest.perf ./loop_read Events: 29K cpu-clock 41.95% loop_read libc-2.14.90.so[.] __GI___libc_read 7.90% loop_read [kernel.kallsyms] [k] system_call_after_swapgs 4.61% loop_read [kernel.kallsyms] [k] vfs_read 3.72% loop_read [kernel.kallsyms] [k] fsnotify __GI___libc_read :003fd20e2010 __read: 0.41 : 3fd20e2010: 83 3d 1d 22 2d 00 00cmpl $0x0,0x2d221d( 86.10 : 3fd20e2017: 75 10 jne 3fd20e2029 __ : :003fd20e2019 __read_nocancel: 0.33 : 3fd20e2019: b8 00 00 00 00 mov $0x0,%eax 0.00 : 3fd20e201e: 0f 05 syscall 12.84 : 3fd20e2020: 48 3d 01 f0 ff ff cmp $0xfff 0.00 : 3fd20e2026: 73 31 jae 3fd20e2059 __ 0.33 : 3fd20e2028: c3 retq --- After the migration, jne instruction in __read became to take a long time. This is due to increase the overhead of read system call. The performance was not improved even if thp/hugetlb was disabled. Does this problem happen only to me? Should I also ask other community about this problem? Shota -Original Message- From: Xiao Guangrong [mailto:xiaoguangr...@linux.vnet.ibm.com] Sent: Thursday, November 01, 2012 1:45 PM To: Uehara, Shouta (shouta.ueh...@jp.yokogawa.com) Cc: kvm@vger.kernel.org Subject: Re: Guest performance is reduced after live migration Shouta, Can it be reproduced if thp/hugetlbfs is disabled on both source and destination? On 11/01/2012 08:12 AM, shouta.ueh...@jp.yokogawa.com wrote: Hello. I have a problem with the performance of the guest Linux after live migration. When I analyze the file I/O latency of the guest using LMbench3, the latency of the guest on the destination host is about 2 times bigger than the guest on the source host. As a result that I investigated it, this problem occurs if three following conditions are right. 1. Use the kernel version 2.6.38.6-26.fc15.x86_64 or later. 2. Execute system calls for low level file I/O (read, write, open etc.) on the guest. 3. Enable EPT. 
Performance cannot decrease on other tests of
[PATCH v5 0/2] x86: vmclear vmcss on all cpus when doing kdump if necessary
Currently, kdump just makes all the logical processors leave VMX operation by executing VMXOFF instruction, so any VMCSs active on the logical processors may be corrupted. But, sometimes, we need the VMCSs to debug guest images contained in the host vmcore. To prevent the corruption, we should VMCLEAR the VMCSs before executing the VMXOFF instruction. The patch set provides a way to VMCLEAR vmcss related to guests on all cpus before executing the VMXOFF when doing kdump. This is used to ensure the VMCSs in the vmcore updated and non-corrupted. Changelog from v4 to v5: 1. use an atomic notifier instead of function call, so have all the vmclear codes in vmx.c. Changelog from v3 to v4: 1. add a new percpu variable vmclear_skipped to skip vmclear in kdump in some conditions. Changelog from v2 to v3: 1. remove unnecessary conditions in function cpu_emergency_clear_loaded_vmcss as Marcelo suggested. Changelog from v1 to v2: 1. remove the sysctl and clear VMCSs unconditionally. Zhang Yanfei (2): x86/kexec: VMCLEAR vmcss on all cpus if necessary KVM-INTEL: add a notifier and a bitmap to support VMCLEAR in kdump arch/x86/include/asm/kexec.h |2 + arch/x86/kernel/crash.c | 25 arch/x86/kvm/vmx.c | 85 ++ 3 files changed, 112 insertions(+), 0 deletions(-) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 1/2] x86/kexec: VMCLEAR vmcss on all cpus if necessary
This patch adds an atomic notifier list named crash_notifier_list. When loading kvm-intel module, a notifier will be registered in the list to enable vmcss loaded on all cpus to be VMCLEAR'd if needed. Signed-off-by: Zhang Yanfei zhangyan...@cn.fujitsu.com --- arch/x86/include/asm/kexec.h |2 ++ arch/x86/kernel/crash.c | 25 + 2 files changed, 27 insertions(+), 0 deletions(-) diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 317ff17..5e22b00 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -163,6 +163,8 @@ struct kimage_arch { }; #endif +extern struct atomic_notifier_head crash_notifier_list; + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_KEXEC_H */ diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 13ad899..0f3d5b4 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -16,6 +16,8 @@ #include linux/delay.h #include linux/elf.h #include linux/elfcore.h +#include linux/module.h +#include linux/notifier.h #include asm/processor.h #include asm/hardirq.h @@ -30,6 +32,19 @@ int in_crash_kexec; +/* + * The list is used to VMCLEAR vmcss loaded on all + * cpus. And when loading kvm_intel module, the + * vmclear function will be registered in the list. + */ +ATOMIC_NOTIFIER_HEAD(crash_notifier_list); +EXPORT_SYMBOL_GPL(crash_notifier_list); + +static inline void cpu_emergency_vmclear_loaded_vmcss(void) +{ + atomic_notifier_call_chain(crash_notifier_list, 0, NULL); +} + #if defined(CONFIG_SMP) defined(CONFIG_X86_LOCAL_APIC) static void kdump_nmi_callback(int cpu, struct pt_regs *regs) @@ -46,6 +61,11 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) #endif crash_save_cpu(regs, cpu); + /* +* VMCLEAR vmcss loaded on all cpus if needed. +*/ + cpu_emergency_vmclear_loaded_vmcss(); + /* Disable VMX or SVM if needed. * * We need to disable virtualization on all CPUs. 
@@ -88,6 +108,11 @@ void native_machine_crash_shutdown(struct pt_regs *regs) kdump_nmi_shootdown_cpus(); + /* +* VMCLEAR vmcss loaded on this cpu if needed. +*/ + cpu_emergency_vmclear_loaded_vmcss(); + /* Booting kdump kernel with VMX or SVM enabled won't work, * because (among other limitations) we can't disable paging * with the virt flags. -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 2/2] KVM-INTEL: add a notifier and a bitmap to support VMCLEAR in kdump
The notifier will be registered in crash_notifier_list when loading kvm-intel module. And the bitmap indicates whether we should do VMCLEAR operation in kdump. The bits in the bitmap are set/unset according to different conditions. Signed-off-by: Zhang Yanfei zhangyan...@cn.fujitsu.com --- arch/x86/kvm/vmx.c | 85 1 files changed, 85 insertions(+), 0 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4ff0ab9..3bbdd75 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -41,6 +41,7 @@ #include asm/i387.h #include asm/xcr.h #include asm/perf_event.h +#include asm/kexec.h #include trace.h @@ -963,6 +964,30 @@ static void vmcs_load(struct vmcs *vmcs) vmcs, phys_addr); } +#ifdef CONFIG_KEXEC +/* + * This bitmap is used to indicate whether the vmclear + * operation is enabled on all cpus. All disabled by + * default. + */ +static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE; + +static inline void crash_enable_local_vmclear(int cpu) +{ + cpumask_set_cpu(cpu, crash_vmclear_enabled_bitmap); +} + +static inline void crash_disable_local_vmclear(int cpu) +{ + cpumask_clear_cpu(cpu, crash_vmclear_enabled_bitmap); +} + +static inline int crash_local_vmclear_enabled(int cpu) +{ + return cpumask_test_cpu(cpu, crash_vmclear_enabled_bitmap); +} +#endif + static void __loaded_vmcs_clear(void *arg) { struct loaded_vmcs *loaded_vmcs = arg; @@ -972,8 +997,14 @@ static void __loaded_vmcs_clear(void *arg) return; /* vcpu migration can race with cpu offline */ if (per_cpu(current_vmcs, cpu) == loaded_vmcs-vmcs) per_cpu(current_vmcs, cpu) = NULL; +#ifdef CONFIG_KEXEC + crash_disable_local_vmclear(cpu); +#endif list_del(loaded_vmcs-loaded_vmcss_on_cpu_link); loaded_vmcs_init(loaded_vmcs); +#ifdef CONFIG_KEXEC + crash_enable_local_vmclear(cpu); +#endif } static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) @@ -1491,8 +1522,14 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 
local_irq_disable(); +#ifdef CONFIG_KEXEC + crash_disable_local_vmclear(cpu); +#endif list_add(vmx-loaded_vmcs-loaded_vmcss_on_cpu_link, per_cpu(loaded_vmcss_on_cpu, cpu)); +#ifdef CONFIG_KEXEC + crash_enable_local_vmclear(cpu); +#endif local_irq_enable(); /* @@ -2302,6 +2339,20 @@ static int hardware_enable(void *garbage) return -EBUSY; INIT_LIST_HEAD(per_cpu(loaded_vmcss_on_cpu, cpu)); + +#ifdef CONFIG_KEXEC + /* +* Now we can enable the vmclear operation in kdump +* since the loaded_vmcss_on_cpu list on this cpu +* has been initialized. +* +* Though the cpu is not in VMX operation now, there +* is no problem to enable the vmclear operation +* for the loaded_vmcss_on_cpu list is empty! +*/ + crash_enable_local_vmclear(cpu); +#endif + rdmsrl(MSR_IA32_FEATURE_CONTROL, old); test_bits = FEATURE_CONTROL_LOCKED; @@ -2335,6 +2386,22 @@ static void vmclear_local_loaded_vmcss(void) __loaded_vmcs_clear(v); } +#ifdef CONFIG_KEXEC +static int crash_vmclear_local_loaded_vmcss(struct notifier_block *this, + unsigned long val, void *ptr) +{ + int cpu = raw_smp_processor_id(); + + if (crash_local_vmclear_enabled(cpu)) + vmclear_local_loaded_vmcss(); + + return NOTIFY_DONE; +} + +static struct notifier_block crash_vmclear_notifier = { + .notifier_call = crash_vmclear_local_loaded_vmcss, +}; +#endif /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() * tricks. @@ -2348,6 +2415,14 @@ static void hardware_disable(void *garbage) { if (vmm_exclusive) { vmclear_local_loaded_vmcss(); +#ifdef CONFIG_KEXEC + /* +* vmclear operation in kdump should be disabled here +* because the cpu is going to exit VMX operation +* and the loaded_vmcss_on_cpu list may not be empty! 
+*/ + crash_disable_local_vmclear(raw_smp_processor_id()); +#endif kvm_cpu_vmxoff(); } write_cr4(read_cr4() ~X86_CR4_VMXE); @@ -7230,6 +7305,11 @@ static int __init vmx_init(void) if (r) goto out3; +#ifdef CONFIG_KEXEC + atomic_notifier_chain_register(crash_notifier_list, + crash_vmclear_notifier); +#endif + vmx_disable_intercept_for_msr(MSR_FS_BASE, false); vmx_disable_intercept_for_msr(MSR_GS_BASE, false); vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); @@ -7265,6 +7345,11 @@ static void __exit vmx_exit(void)
Re: [PATCH 2/5] KVM: MMU: simplify mmu_set_spte
On 11/21/2012 07:51 AM, Marcelo Tosatti wrote: On Wed, Nov 21, 2012 at 07:23:26AM +0800, Xiao Guangrong wrote: On 11/21/2012 06:18 AM, Marcelo Tosatti wrote: -child = page_header(pte PT64_BASE_ADDR_MASK); -drop_parent_pte(child, sptep); -kvm_flush_remote_tlbs(vcpu-kvm); How come its safe to drop this case? We use if (pfn != spte_to_pfn(*sptep)) to simplify the thing. There are two cases: 1) the sptep is not the last mapping. under this case, sptep must point to a shadow page table, that means spte_to_pfn(*sptep)) is used by KVM module, and 'pfn' is used by userspace. so, 'if' condition must be satisfied, the sptep will be dropped. Actually, This is the origin case: | if (level PT_PAGE_TABLE_LEVEL |!is_large_pte(*sptep)) 2) the sptep is the last mapping. under this case, the level of spte (sp.level) must equal the 'level' which we pass to mmu_set_spte. If they point to the same pfn, it is 'remap', otherwise we drop it. I think this is safe. :) mmu_page_zap_pte takes care of it, OK. What if was_rmapped=true but gfn is different? Say if the spte comes from an unsync shadow page, the guest modifies that shadow page (but does not invalidate it with invlpg), then faults. gfn can still point to the same gfn (but in that case, with your patch, page_header_update_slot is not called. Marcelo, Page fault path and other sync/prefetch paths will reread guest page table, then it get a different target pfn. The scenario is like this: gfn1 = pfn1, gfn2 = pfn2 gpte = pfn1, spte is shadowed by gpte and it is a unsync spte Guest Host spte = (gfn1, pfn1) modify gpte to let it point to gfn2 spte = (gfn1, pfn1) page-fault on gpte intercept the page-fault, then want to update spte to (gfn2, pfn2) in mmu_set_spte, we can detect pfn2 != pfn1, then drop it. Hmm, the interesting thing is what if different gfns map to the same pfn. 
For example, spte1 is shadowed by gfn1 and spte2 is shadowed by pfn2, both gfn1 and gfn2 map to pfn, the code (including the current code) will set spte1 to the gfn2's rmap and spte2 to the gfn1's rmap. But i think it is ok. Current code updates gfn properly in set_spte by page_header_update_slot. Better keep state properly. Okay, i will not change the position of page_header_update_slot in the next version. Thank you, Marcelo! -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH v5] kvm/fpu: Enable fully eager restore kvm FPU
-Original Message- From: Marcelo Tosatti [mailto:mtosa...@redhat.com] Sent: Wednesday, November 21, 2012 6:00 AM To: Hao, Xudong Cc: a...@redhat.com; kvm@vger.kernel.org Subject: Re: [PATCH v5] kvm/fpu: Enable fully eager restore kvm FPU On Wed, Nov 07, 2012 at 10:01:11AM +0800, Xudong Hao wrote: Romove fpu lazy restore logic, using eager restore totally. v5 changes from v4: - remove lazy fpu restore totally, fpu eager restore does not have performance regression and simple the code. v4 changes from v3: - Wrap up some confused code with a clear function lazy_fpu_allowed() - Update fpu while update cr4 too. v3 changes from v2: - Make fpu active explicitly while guest xsave is enabling and non-lazy xstate bit exist. v2 changes from v1: - Expand KVM_XSTATE_LAZY to 64 bits before negating it. Signed-off-by: Xudong Hao xudong@intel.com --- arch/x86/kvm/vmx.c | 9 ++--- arch/x86/kvm/x86.c | 8 +--- include/linux/kvm_host.h | 1 - 3 files changed, 3 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6599e45..c1fd2e1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1197,7 +1197,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) u32 eb; eb = (1u PF_VECTOR) | (1u UD_VECTOR) | (1u MC_VECTOR) | -(1u NM_VECTOR) | (1u DB_VECTOR); +(1u DB_VECTOR); if ((vcpu-guest_debug (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) Please remove the code entirely, including: if (is_no_device(intr_info)) { vmx_fpu_activate(vcpu); return 1; } and clts handling. fpu_active/fpu_deactivate callbacks become unused, don't they? Also remove fpu_active variable. Okay, will remove all of these. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Guest performance is reduced after live migration
On 11/21/2012 09:25 AM, shouta.ueh...@jp.yokogawa.com wrote: Dear all, I continue watching the mailing list to see whether a similar problem is reported, because the problem does not seem to happen to others. Any information, however small, would be appreciated. I am digging into it, but did not get a useful hint so far... -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] vhost-blk: Add vhost-blk support v5
On 11/20/2012 09:37 PM, Michael S. Tsirkin wrote: On Tue, Nov 20, 2012 at 02:39:40PM +0800, Asias He wrote: On 11/20/2012 04:26 AM, Michael S. Tsirkin wrote: On Mon, Nov 19, 2012 at 04:53:42PM +0800, Asias He wrote: vhost-blk is an in-kernel virito-blk device accelerator. Due to lack of proper in-kernel AIO interface, this version converts guest's I/O request to bio and use submit_bio() to submit I/O directly. So this version any supports raw block device as guest's disk image, e.g. /dev/sda, /dev/ram0. We can add file based image support to vhost-blk once we have in-kernel AIO interface. There are some work in progress for in-kernel AIO interface from Dave Kleikamp and Zach Brown: http://marc.info/?l=linux-fsdevelm=133312234313122 Performance evaluation: - 1) LKVM Fio with libaio ioengine on Fusion IO device using kvm tool IOPS(k)Before After Improvement seq-read 107 121 +13.0% seq-write 130 179 +37.6% rnd-read 102 122 +19.6% rnd-write 125 159 +27.0% 2) QEMU Fio with libaio ioengine on Fusion IO device using QEMU IOPS(k)Before After Improvement seq-read 76 123 +61.8% seq-write 139 173 +24.4% rnd-read 73 120 +64.3% rnd-write 75 156 +108.0% Could you compare with dataplane qemu as well please? Well, I will try to collect it. 
Userspace bits: - 1) LKVM The latest vhost-blk userspace bits for kvm tool can be found here: g...@github.com:asias/linux-kvm.git blk.vhost-blk 2) QEMU The latest vhost-blk userspace prototype for QEMU can be found here: g...@github.com:asias/qemu.git blk.vhost-blk Changes in v5: - Do not assume the buffer layout - Fix wakeup race Changes in v4: - Mark req-status as userspace pointer - Use __copy_to_user() instead of copy_to_user() in vhost_blk_set_status() - Add if (need_resched()) schedule() in blk thread - Kill vhost_blk_stop_vq() and move it into vhost_blk_stop() - Use vq_err() instead of pr_warn() - Fail un Unsupported request - Add flush in vhost_blk_set_features() Changes in v3: - Sending REQ_FLUSH bio instead of vfs_fsync, thanks Christoph! - Check file passed by user is a raw block device file Signed-off-by: Asias He as...@redhat.com Since there are files shared by this and vhost net it's easiest for me to merge this all through the vhost tree. Jens, could you ack this and the bio usage in this driver please? --- drivers/vhost/Kconfig | 1 + drivers/vhost/Kconfig.blk | 10 + drivers/vhost/Makefile| 2 + drivers/vhost/blk.c | 697 ++ drivers/vhost/blk.h | 8 + 5 files changed, 718 insertions(+) create mode 100644 drivers/vhost/Kconfig.blk create mode 100644 drivers/vhost/blk.c create mode 100644 drivers/vhost/blk.h diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 202bba6..acd8038 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -11,4 +11,5 @@ config VHOST_NET if STAGING source drivers/vhost/Kconfig.tcm +source drivers/vhost/Kconfig.blk endif diff --git a/drivers/vhost/Kconfig.blk b/drivers/vhost/Kconfig.blk new file mode 100644 index 000..ff8ab76 --- /dev/null +++ b/drivers/vhost/Kconfig.blk @@ -0,0 +1,10 @@ +config VHOST_BLK + tristate Host kernel accelerator for virtio blk (EXPERIMENTAL) + depends on BLOCK EXPERIMENTAL m + ---help--- +This kernel module can be loaded in host kernel to accelerate +guest block with virtio_blk. 
Not to be confused with virtio_blk +module itself which needs to be loaded in guest kernel. + +To compile this driver as a module, choose M here: the module will +be called vhost_blk. diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile index a27b053..1a8a4a5 100644 --- a/drivers/vhost/Makefile +++ b/drivers/vhost/Makefile @@ -2,3 +2,5 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o vhost_net-y := vhost.o net.o obj-$(CONFIG_TCM_VHOST) += tcm_vhost.o +obj-$(CONFIG_VHOST_BLK) += vhost_blk.o +vhost_blk-y := blk.o diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c new file mode 100644 index 000..f0f118a --- /dev/null +++ b/drivers/vhost/blk.c @@ -0,0 +1,697 @@ +/* + * Copyright (C) 2011 Taobao, Inc. + * Author: Liu Yuan tailai...@taobao.com + * + * Copyright (C) 2012 Red Hat, Inc. + * Author: Asias He as...@redhat.com + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * virtio-blk server in host kernel. + */ + +#include linux/miscdevice.h +#include linux/module.h +#include linux/vhost.h +#include linux/virtio_blk.h +#include linux/mutex.h +#include linux/file.h +#include linux/kthread.h +#include linux/blkdev.h +#include linux/llist.h + +#include vhost.c +#include vhost.h +#include blk.h + +static
Re: messed up with xml-files and configuration of a VM
On Tue, Nov 20, 2012 at 5:13 PM, Lentes, Bernd bernd.len...@helmholtz-muenchen.de wrote: first, i'm new to kvm. I'm running KVM on a sles 11 sp2, kernel 3.0.13-0.27-default. My guest is an Ubuntu 12.04 LTS 64bit. The guest has attached a CDROM, using an iso-file from a CIFS-Share. I detached it with the virtual machine manager (0.9.0). I don't see the cd-rom anymore in the virtual machine manager. But when i try to start the vm, it complains about the missing iso-file. Why? I detached it. When i like to have a look in the xml-files of the guest, i found three! One in /var/lib/kvm/images, one in /etc/libvirt/qemu and one in /etc/kvm/vm. Which one should i use to configure the vm? In the one in /etc/libvirt/qemu the cifs-share isn't mentioned any longer, in the other two it is still. Is it possible to configure the vm editing one of the XML-files? Or shall i use virsh? Using virsh, does the vm have to be stopped or can i edit the configuration for a running vm? Why three xml-files? Why is detaching with the virtual machine manager not working? Hi Bernd, This is a libvirt question, I have CCed the libvirt mailing list. Do not edit the XML files on disk. Instead, use virsh edit (to modify) and virsh dumpxml (to view). Stefan -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: qemu-kvm-1.2.0: double free or corruption
On Mon, Nov 19, 2012 at 8:56 AM, Nikola Ciprich nikola.cipr...@linuxbox.cz wrote: on one of our servers, windows 2008 KVM suddenly crashed. I see following in libvirt log: *** glibc detected *** /usr/bin/qemu-kvm: double free or corruption (!prev): 0x7fc634008cd0 *** === Backtrace: = /lib64/libc.so.6(+0x75916)[0x7fc9026f4916] /lib64/libc.so.6(+0x78443)[0x7fc9026f7443] /usr/bin/qemu-kvm(+0x1faeb1)[0x7fc907187eb1] /usr/bin/qemu-kvm(+0x1f0e1a)[0x7fc90717de1a] /usr/bin/qemu-kvm(+0x1fb681)[0x7fc907188681] /usr/bin/qemu-kvm(+0xed6a7)[0x7fc90707a6a7] /usr/bin/qemu-kvm(+0x195c31)[0x7fc907122c31] /usr/bin/qemu-kvm(main+0x106c)[0x7fc90711e5fc] /lib64/libc.so.6(__libc_start_main+0xfd)[0x7fc90269dcdd] /usr/bin/qemu-kvm(+0x749f9)[0x7fc9070019f9] [...] I guess this is not of much use, since I didn't have debuginfo package installed in time of crash. Is it possible to obtain more debuginfo after I installed it? Is there something else I should check to find where the problem could be? No problem, you can still resolve symbols afterwards. Download the debuginfo package and use something along the lines of: $ addr2line -e /path/to/debug-executable 0x1faeb1 0x1f0e1a 0x1fb681 0xed6a7 0x195c31 It's important to fetch the debuginfo package for the exact same version of the qemu RPM you were running. Stefan -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: qemu-kvm-1.2.0: double free or corruption in VNC code
Hello Stefan, thanks! here it goes.. *** glibc detected *** /usr/bin/qemu-kvm: double free or corruption (!prev): 0x7fc634008cd0 *** === Backtrace: = /lib64/libc.so.6(+0x75916)[0x7fc9026f4916] /lib64/libc.so.6(+0x78443)[0x7fc9026f7443] /usr/bin/qemu-kvm(+0x1faeb1)[0x7fc907187eb1] /usr/bin/qemu-kvm(+0x1f0e1a)[0x7fc90717de1a] /usr/bin/qemu-kvm(+0x1fb681)[0x7fc907188681] /usr/bin/qemu-kvm(+0xed6a7)[0x7fc90707a6a7] /usr/bin/qemu-kvm(+0x195c31)[0x7fc907122c31] /usr/bin/qemu-kvm(main+0x106c)[0x7fc90711e5fc] /lib64/libc.so.6(__libc_start_main+0xfd)[0x7fc90269dcdd] /usr/bin/qemu-kvm(+0x749f9)[0x7fc9070019f9] [...] [root@blg qemu-kvm-1.2.0]# addr2line -e /usr/lib/debug/usr/bin/qemu-kvm.debug 0x1faeb1 0x1f0e1a 0x1fb681 0xed6a7 0x195c31 0x106c /usr/src/debug/qemu-kvm-1.2.0/ui/vnc.c:499 /usr/src/debug/qemu-kvm-1.2.0/ui/vnc-enc-zrle.c:364 /usr/src/debug/qemu-kvm-1.2.0/ui/vnc.c:1037 /usr/src/debug/qemu-kvm-1.2.0/iohandler.c:159 /usr/src/debug/qemu-kvm-1.2.0/main-loop.c:499 ??:0 this makes some sense to me, since it crashed while there was VNC connection active.. It's important to fetch the debuginfo package for the exact same version of the qemu RPM you were running. sure, it's the same version. BR nik -- - Ing. Nikola CIPRICH LinuxBox.cz, s.r.o. 28.rijna 168, 709 00 Ostrava tel.: +420 591 166 214 fax:+420 596 621 273 mobil: +420 777 093 799 www.linuxbox.cz mobil servis: +420 737 238 656 email servis: ser...@linuxbox.cz - pgpCg25xs1tyP.pgp Description: PGP signature
[PATCH v2 2/3] KVM: PPC: Book3S HV: Make a HPTE removal function available
This makes a HPTE removal function, kvmppc_do_h_remove(), available outside book3s_hv_rm_mmu.c. This will be used by the HPT writing code. Signed-off-by: Paul Mackerras pau...@samba.org --- v2: basically unchanged from v1, just rediffed arch/powerpc/include/asm/kvm_book3s.h |3 +++ arch/powerpc/kvm/book3s_hv_rm_mmu.c | 19 +-- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index fea768f..46763d10 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -160,6 +160,9 @@ extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel, pgd_t *pgdir, bool realmode, unsigned long *idx_ret); +extern long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, + unsigned long pte_index, unsigned long avpn, + unsigned long *hpret); extern long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long *map); diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index a96f90a..2334000 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -365,11 +365,10 @@ static inline int try_lock_tlbie(unsigned int *lock) return old == 0; } -long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, -unsigned long pte_index, unsigned long avpn, -unsigned long va) +long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, + unsigned long pte_index, unsigned long avpn, + unsigned long *hpret) { - struct kvm *kvm = vcpu-kvm; unsigned long *hpte; unsigned long v, r, rb; struct revmap_entry *rev; @@ -411,10 +410,18 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, note_hpte_modification(kvm, rev); unlock_hpte(hpte, 0); - vcpu-arch.gpr[4] = v; - vcpu-arch.gpr[5] = r; + hpret[0] = v; + hpret[1] = r; 
return H_SUCCESS; } +EXPORT_SYMBOL_GPL(kvmppc_do_h_remove); + +long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, +unsigned long pte_index, unsigned long avpn) +{ + return kvmppc_do_h_remove(vcpu-kvm, flags, pte_index, avpn, + vcpu-arch.gpr[4]); +} long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) { -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 1/3] KVM: PPC: Book3S HV: Add a mechanism for recording modified HPTEs
This uses a bit in our record of the guest view of the HPTE to record when the HPTE gets modified. We use a reserved bit for this, and ensure that this bit is always cleared in HPTE values returned to the guest. The recording of modified HPTEs is only done if other code indicates its interest by setting kvm-arch.hpte_mod_interest to a non-zero value. The reason for this is that when later commits add facilities for userspace to read the HPT, the first pass of reading the HPT will be quicker if there are no (or very few) HPTEs marked as modified, rather than having most HPTEs marked as modified. Signed-off-by: Paul Mackerras pau...@samba.org --- v2: added HPTE_GR_RESERVED, clear those bits in H_ENTER arch/powerpc/include/asm/kvm_book3s_64.h |9 + arch/powerpc/include/asm/kvm_host.h |1 + arch/powerpc/kvm/book3s_hv_rm_mmu.c | 28 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 1472a5b..b322e5b 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -50,6 +50,15 @@ extern int kvm_hpt_order;/* order of preallocated HPTs */ #define HPTE_V_HVLOCK 0x40UL #define HPTE_V_ABSENT 0x20UL +/* + * We use this bit in the guest_rpte field of the revmap entry + * to indicate a modified HPTE. 
+ */ +#define HPTE_GR_MODIFIED (1ul 62) + +/* These bits are reserved in the guest view of the HPTE */ +#define HPTE_GR_RESERVED HPTE_GR_MODIFIED + static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits) { unsigned long tmp, old; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 3093896..58c7264 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -248,6 +248,7 @@ struct kvm_arch { atomic_t vcpus_running; unsigned long hpt_npte; unsigned long hpt_mask; + atomic_t hpte_mod_interest; spinlock_t slot_phys_lock; unsigned short last_vcpu[NR_CPUS]; struct kvmppc_vcore *vcores[KVM_MAX_VCORES]; diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 362dffe..a96f90a 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -66,6 +66,17 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, } EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); +/* + * Note modification of an HPTE; set the HPTE modified bit + * if anyone is interested. 
+ */ +static inline void note_hpte_modification(struct kvm *kvm, + struct revmap_entry *rev) +{ + if (atomic_read(kvm-arch.hpte_mod_interest)) + rev-guest_rpte |= HPTE_GR_MODIFIED; +} + /* Remove this HPTE from the chain for a real page */ static void remove_revmap_chain(struct kvm *kvm, long pte_index, struct revmap_entry *rev, @@ -138,7 +149,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, unsigned long slot_fn, hva; unsigned long *hpte; struct revmap_entry *rev; - unsigned long g_ptel = ptel; + unsigned long g_ptel; struct kvm_memory_slot *memslot; unsigned long *physp, pte_size; unsigned long is_io; @@ -153,6 +164,8 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, return H_PARAMETER; writing = hpte_is_writable(ptel); pteh = ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID); + ptel = ~HPTE_GR_RESERVED; + g_ptel = ptel; /* used later to detect if we might have been invalidated */ mmu_seq = kvm-mmu_notifier_seq; @@ -287,8 +300,10 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, rev = kvm-arch.revmap[pte_index]; if (realmode) rev = real_vmalloc_addr(rev); - if (rev) + if (rev) { rev-guest_rpte = g_ptel; + note_hpte_modification(kvm, rev); + } /* Link HPTE into reverse-map chain */ if (pteh HPTE_V_VALID) { @@ -392,7 +407,8 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, /* Read PTE low word after tlbie to get final R/C values */ remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]); } - r = rev-guest_rpte; + r = rev-guest_rpte ~HPTE_GR_RESERVED; + note_hpte_modification(kvm, rev); unlock_hpte(hpte, 0); vcpu-arch.gpr[4] = v; @@ -466,6 +482,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) args[j] = ((0x80 | flags) 56) + pte_index; rev = real_vmalloc_addr(kvm-arch.revmap[pte_index]); + note_hpte_modification(kvm, rev); if (!(hp[0] HPTE_V_VALID)) {
[PATCH v2 3/3] KVM: PPC: Book3S HV: Provide a method for userspace to read and write the HPT
A new ioctl, KVM_PPC_GET_HTAB_FD, returns a file descriptor. Reads on this fd return the contents of the HPT (hashed page table), writes create and/or remove entries in the HPT. There is a new capability, KVM_CAP_PPC_HTAB_FD, to indicate the presence of the ioctl. The ioctl takes an argument structure with the index of the first HPT entry to read out and a set of flags. The flags indicate whether the user is intending to read or write the HPT, and whether to return all entries or only the bolted entries (those with the bolted bit, 0x10, set in the first doubleword). This is intended for use in implementing qemu's savevm/loadvm and for live migration. Therefore, on reads, the first pass returns information about all HPTEs (or all bolted HPTEs). When the first pass reaches the end of the HPT, it returns from the read. Subsequent reads only return information about HPTEs that have changed since they were last read. A read that finds no changed HPTEs in the HPT following where the last read finished will return 0 bytes. The format of the data provides a simple run-length compression of the invalid entries. Each block of data starts with a header that indicates the index (position in the HPT, which is just an array), the number of valid entries starting at that index (may be zero), and the number of invalid entries following those valid entries. The valid entries, 16 bytes each, follow the header. The invalid entries are not explicitly represented. 
Signed-off-by: Paul Mackerras pau...@samba.org --- v2: added comments, added reserved field in struct kvm_get_htab_fd Documentation/virtual/kvm/api.txt| 53 + arch/powerpc/include/asm/kvm_book3s_64.h | 22 ++ arch/powerpc/include/asm/kvm_ppc.h |2 + arch/powerpc/include/uapi/asm/kvm.h | 25 +++ arch/powerpc/kvm/book3s_64_mmu_hv.c | 344 ++ arch/powerpc/kvm/book3s_hv.c | 12 -- arch/powerpc/kvm/powerpc.c | 17 ++ include/uapi/linux/kvm.h |3 + 8 files changed, 466 insertions(+), 12 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 6671fdc..33080ea 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2071,6 +2071,59 @@ KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; source cpu in parm Note that the vcpu ioctl is asynchronous to vcpu execution. +4.78 KVM_PPC_GET_HTAB_FD + +Capability: KVM_CAP_PPC_HTAB_FD +Architectures: powerpc +Type: vm ioctl +Parameters: Pointer to struct kvm_get_htab_fd (in) +Returns: file descriptor number (= 0) on success, -1 on error + +This returns a file descriptor that can be used either to read out the +entries in the guest's hashed page table (HPT), or to write entries to +initialize the HPT. The returned fd can only be written to if the +KVM_GET_HTAB_WRITE bit is set in the flags field of the argument, and +can only be read if that bit is clear. The argument struct looks like +this: + +/* For KVM_PPC_GET_HTAB_FD */ +struct kvm_get_htab_fd { + __u64 flags; + __u64 start_index; +}; + +/* Values for kvm_get_htab_fd.flags */ +#define KVM_GET_HTAB_BOLTED_ONLY ((__u64)0x1) +#define KVM_GET_HTAB_WRITE ((__u64)0x2) + +The `start_index' field gives the index in the HPT of the entry at +which to start reading. It is ignored when writing. + +Reads on the fd will initially supply information about all +interesting HPT entries. Interesting entries are those with the +bolted bit set, if the KVM_GET_HTAB_BOLTED_ONLY bit is set, otherwise +all entries. 
When the end of the HPT is reached, the read() will +return. If read() is called again on the fd, it will start again from +the beginning of the HPT, but will only return HPT entries that have +changed since they were last read. + +Data read or written is structured as a header (8 bytes) followed by a +series of valid HPT entries (16 bytes) each. The header indicates how +many valid HPT entries there are and how many invalid entries follow +the valid entries. The invalid entries are not represented explicitly +in the stream. The header format is: + +struct kvm_get_htab_header { + __u32 index; + __u16 n_valid; + __u16 n_invalid; +}; + +Writes to the fd create HPT entries starting at the index given in the +header; first `n_valid' valid entries with contents from the data +written, then `n_invalid' invalid entries, invalidating any previously +valid entries found. + 5. The kvm_run structure diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index b322e5b..38bec1d 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -246,4 +246,26 @@ static inline bool slot_is_aligned(struct kvm_memory_slot *memslot, return !(memslot-base_gfn mask) !(memslot-npages mask); } +/* + * This
[PATCH] KVM: PPC: Book3S HV: Fix bug causing loss of page dirty state
This fixes a bug where adding a new guest HPT entry via the H_ENTER hcall would lose the changed bit in the reverse map information for the guest physical page being mapped. The result was that the KVM_GET_DIRTY_LOG could return a zero bit for the page even though the page had been modified by the guest. This fixes it by only modifying the index and present bits in the reverse map entry, thus preserving the reference and change bits. We were also unnecessarily setting the reference bit, and this fixes that too. Signed-off-by: Paul Mackerras pau...@samba.org --- This is against Alex Graf's kvm-ppc-next branch plus the series of three patches I just sent, but it should be independent of that series. arch/powerpc/kvm/book3s_hv_rm_mmu.c |6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 2334000..fc3da32 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -59,10 +59,10 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, head->back = pte_index; } else { rev->forw = rev->back = pte_index; - i = pte_index; + *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | + pte_index | KVMPPC_RMAP_PRESENT; } - smp_wmb(); - *rmap = i | KVMPPC_RMAP_REFERENCED | KVMPPC_RMAP_PRESENT; /* unlock */ + unlock_rmap(rmap); } EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: PPC: Book3S HV: Fix bug causing loss of page dirty state
On 20.11.2012, at 10:01, Paul Mackerras wrote: This fixes a bug where adding a new guest HPT entry via the H_ENTER hcall would lose the changed bit in the reverse map information for the guest physical page being mapped. The result was that the KVM_GET_DIRTY_LOG could return a zero bit for the page even though the page had been modified by the guest. This fixes it by only modifying the index and present bits in the reverse map entry, thus preserving the reference and change bits. We were also unnecessarily setting the reference bit, and this fixes that too. Signed-off-by: Paul Mackerras pau...@samba.org Thanks, applied to kvm-ppc-next. Alex --- This is against Alex Graf's kvm-ppc-next branch plus the series of three patches I just sent, but it should be independent of that series. arch/powerpc/kvm/book3s_hv_rm_mmu.c |6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 2334000..fc3da32 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -59,10 +59,10 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, head->back = pte_index; } else { rev->forw = rev->back = pte_index; - i = pte_index; + *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | + pte_index | KVMPPC_RMAP_PRESENT; } - smp_wmb(); - *rmap = i | KVMPPC_RMAP_REFERENCED | KVMPPC_RMAP_PRESENT; /* unlock */ + unlock_rmap(rmap); } EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 3/3] KVM: PPC: Book3S HV: Provide a method for userspace to read and write the HPT
On 20.11.2012, at 09:57, Paul Mackerras wrote: A new ioctl, KVM_PPC_GET_HTAB_FD, returns a file descriptor. Reads on this fd return the contents of the HPT (hashed page table), writes create and/or remove entries in the HPT. There is a new capability, KVM_CAP_PPC_HTAB_FD, to indicate the presence of the ioctl. The ioctl takes an argument structure with the index of the first HPT entry to read out and a set of flags. The flags indicate whether the user is intending to read or write the HPT, and whether to return all entries or only the bolted entries (those with the bolted bit, 0x10, set in the first doubleword). This is intended for use in implementing qemu's savevm/loadvm and for live migration. Therefore, on reads, the first pass returns information about all HPTEs (or all bolted HPTEs). When the first pass reaches the end of the HPT, it returns from the read. Subsequent reads only return information about HPTEs that have changed since they were last read. A read that finds no changed HPTEs in the HPT following where the last read finished will return 0 bytes. The format of the data provides a simple run-length compression of the invalid entries. Each block of data starts with a header that indicates the index (position in the HPT, which is just an array), the number of valid entries starting at that index (may be zero), and the number of invalid entries following those valid entries. The valid entries, 16 bytes each, follow the header. The invalid entries are not explicitly represented. 
Signed-off-by: Paul Mackerras pau...@samba.org --- v2: added comments, added reserved field in struct kvm_get_htab_fd Documentation/virtual/kvm/api.txt| 53 + arch/powerpc/include/asm/kvm_book3s_64.h | 22 ++ arch/powerpc/include/asm/kvm_ppc.h |2 + arch/powerpc/include/uapi/asm/kvm.h | 25 +++ arch/powerpc/kvm/book3s_64_mmu_hv.c | 344 ++ arch/powerpc/kvm/book3s_hv.c | 12 -- arch/powerpc/kvm/powerpc.c | 17 ++ include/uapi/linux/kvm.h |3 + 8 files changed, 466 insertions(+), 12 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 6671fdc..33080ea 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2071,6 +2071,59 @@ KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; source cpu in parm Note that the vcpu ioctl is asynchronous to vcpu execution. +4.78 KVM_PPC_GET_HTAB_FD + +Capability: KVM_CAP_PPC_HTAB_FD +Architectures: powerpc +Type: vm ioctl +Parameters: Pointer to struct kvm_get_htab_fd (in) +Returns: file descriptor number (>= 0) on success, -1 on error + +This returns a file descriptor that can be used either to read out the +entries in the guest's hashed page table (HPT), or to write entries to +initialize the HPT. The returned fd can only be written to if the +KVM_GET_HTAB_WRITE bit is set in the flags field of the argument, and +can only be read if that bit is clear. The argument struct looks like +this: + +/* For KVM_PPC_GET_HTAB_FD */ +struct kvm_get_htab_fd { + __u64 flags; + __u64 start_index; Documentation is out of sync :). Applied all 3 with fixed documentation. Alex -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html