[PATCH v2 2/3] KVM: PPC: Book3S HV: Make a HPTE removal function available

2012-11-20 Thread Paul Mackerras
This makes a HPTE removal function, kvmppc_do_h_remove(), available
outside book3s_hv_rm_mmu.c.  This will be used by the HPT writing
code.

Signed-off-by: Paul Mackerras pau...@samba.org
---
v2: basically unchanged from v1, just rediffed

 arch/powerpc/include/asm/kvm_book3s.h |3 +++
 arch/powerpc/kvm/book3s_hv_rm_mmu.c   |   19 +--
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index fea768f..46763d10 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -160,6 +160,9 @@ extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, 
unsigned long flags,
 extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
long pte_index, unsigned long pteh, unsigned long ptel,
pgd_t *pgdir, bool realmode, unsigned long *idx_ret);
+extern long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
+   unsigned long pte_index, unsigned long avpn,
+   unsigned long *hpret);
 extern long kvmppc_hv_get_dirty_log(struct kvm *kvm,
struct kvm_memory_slot *memslot, unsigned long *map);
 
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index a96f90a..2334000 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -365,11 +365,10 @@ static inline int try_lock_tlbie(unsigned int *lock)
return old == 0;
 }
 
-long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
-unsigned long pte_index, unsigned long avpn,
-unsigned long va)
+long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
+   unsigned long pte_index, unsigned long avpn,
+   unsigned long *hpret)
 {
-   struct kvm *kvm = vcpu->kvm;
unsigned long *hpte;
unsigned long v, r, rb;
struct revmap_entry *rev;
@@ -411,10 +410,18 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long 
flags,
note_hpte_modification(kvm, rev);
unlock_hpte(hpte, 0);
 
-   vcpu->arch.gpr[4] = v;
-   vcpu->arch.gpr[5] = r;
+   hpret[0] = v;
+   hpret[1] = r;
return H_SUCCESS;
 }
+EXPORT_SYMBOL_GPL(kvmppc_do_h_remove);
+
+long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
+unsigned long pte_index, unsigned long avpn)
+{
+   return kvmppc_do_h_remove(vcpu->kvm, flags, pte_index, avpn,
+ &vcpu->arch.gpr[4]);
+}
 
 long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 {
-- 
1.7.10.4



[PATCH v2 1/3] KVM: PPC: Book3S HV: Add a mechanism for recording modified HPTEs

2012-11-20 Thread Paul Mackerras
This uses a bit in our record of the guest view of the HPTE to record
when the HPTE gets modified.  We use a reserved bit for this, and ensure
that this bit is always cleared in HPTE values returned to the guest.

The recording of modified HPTEs is only done if other code indicates
its interest by setting kvm->arch.hpte_mod_interest to a non-zero value.
The reason for this is that when later commits add facilities for
userspace to read the HPT, the first pass of reading the HPT will be
quicker if there are no (or very few) HPTEs marked as modified,
rather than having most HPTEs marked as modified.

Signed-off-by: Paul Mackerras pau...@samba.org
---
v2: added HPTE_GR_RESERVED, clear those bits in H_ENTER

 arch/powerpc/include/asm/kvm_book3s_64.h |9 +
 arch/powerpc/include/asm/kvm_host.h  |1 +
 arch/powerpc/kvm/book3s_hv_rm_mmu.c  |   28 
 3 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
b/arch/powerpc/include/asm/kvm_book3s_64.h
index 1472a5b..b322e5b 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -50,6 +50,15 @@ extern int kvm_hpt_order;/* order of 
preallocated HPTs */
 #define HPTE_V_HVLOCK  0x40UL
 #define HPTE_V_ABSENT  0x20UL
 
+/*
+ * We use this bit in the guest_rpte field of the revmap entry
+ * to indicate a modified HPTE.
+ */
+#define HPTE_GR_MODIFIED   (1ul << 62)
+
+/* These bits are reserved in the guest view of the HPTE */
+#define HPTE_GR_RESERVED   HPTE_GR_MODIFIED
+
 static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
 {
unsigned long tmp, old;
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 3093896..58c7264 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -248,6 +248,7 @@ struct kvm_arch {
atomic_t vcpus_running;
unsigned long hpt_npte;
unsigned long hpt_mask;
+   atomic_t hpte_mod_interest;
spinlock_t slot_phys_lock;
unsigned short last_vcpu[NR_CPUS];
struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 362dffe..a96f90a 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -66,6 +66,17 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct 
revmap_entry *rev,
 }
 EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
 
+/*
+ * Note modification of an HPTE; set the HPTE modified bit
+ * if anyone is interested.
+ */
+static inline void note_hpte_modification(struct kvm *kvm,
+ struct revmap_entry *rev)
+{
+   if (atomic_read(&kvm->arch.hpte_mod_interest))
+   rev->guest_rpte |= HPTE_GR_MODIFIED;
+}
+
 /* Remove this HPTE from the chain for a real page */
 static void remove_revmap_chain(struct kvm *kvm, long pte_index,
struct revmap_entry *rev,
@@ -138,7 +149,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
unsigned long slot_fn, hva;
unsigned long *hpte;
struct revmap_entry *rev;
-   unsigned long g_ptel = ptel;
+   unsigned long g_ptel;
struct kvm_memory_slot *memslot;
unsigned long *physp, pte_size;
unsigned long is_io;
@@ -153,6 +164,8 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
return H_PARAMETER;
writing = hpte_is_writable(ptel);
pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);
+   ptel &= ~HPTE_GR_RESERVED;
+   g_ptel = ptel;
 
/* used later to detect if we might have been invalidated */
mmu_seq = kvm->mmu_notifier_seq;
@@ -287,8 +300,10 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long 
flags,
rev = &kvm->arch.revmap[pte_index];
if (realmode)
rev = real_vmalloc_addr(rev);
-   if (rev)
+   if (rev) {
rev->guest_rpte = g_ptel;
+   note_hpte_modification(kvm, rev);
+   }
 
/* Link HPTE into reverse-map chain */
if (pteh & HPTE_V_VALID) {
@@ -392,7 +407,8 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long 
flags,
/* Read PTE low word after tlbie to get final R/C values */
remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]);
}
-   r = rev->guest_rpte;
+   r = rev->guest_rpte & ~HPTE_GR_RESERVED;
+   note_hpte_modification(kvm, rev);
unlock_hpte(hpte, 0);
 
vcpu->arch.gpr[4] = v;
@@ -466,6 +482,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 
args[j] = ((0x80 | flags) << 56) + pte_index;
rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+   note_hpte_modification(kvm, rev);
 
if (!(hp[0] & HPTE_V_VALID)) {
 

[PATCH v2 3/3] KVM: PPC: Book3S HV: Provide a method for userspace to read and write the HPT

2012-11-20 Thread Paul Mackerras
A new ioctl, KVM_PPC_GET_HTAB_FD, returns a file descriptor.  Reads on
this fd return the contents of the HPT (hashed page table), writes
create and/or remove entries in the HPT.  There is a new capability,
KVM_CAP_PPC_HTAB_FD, to indicate the presence of the ioctl.  The ioctl
takes an argument structure with the index of the first HPT entry to
read out and a set of flags.  The flags indicate whether the user is
intending to read or write the HPT, and whether to return all entries
or only the bolted entries (those with the bolted bit, 0x10, set in
the first doubleword).

This is intended for use in implementing qemu's savevm/loadvm and for
live migration.  Therefore, on reads, the first pass returns information
about all HPTEs (or all bolted HPTEs).  When the first pass reaches the
end of the HPT, it returns from the read.  Subsequent reads only return
information about HPTEs that have changed since they were last read.
A read that finds no changed HPTEs in the HPT following where the last
read finished will return 0 bytes.

The format of the data provides a simple run-length compression of the
invalid entries.  Each block of data starts with a header that indicates
the index (position in the HPT, which is just an array), the number of
valid entries starting at that index (may be zero), and the number of
invalid entries following those valid entries.  The valid entries, 16
bytes each, follow the header.  The invalid entries are not explicitly
represented.
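
For illustration only (not part of the patch): a minimal userspace sketch of a
reader for this stream could look as follows, where the fd is assumed to come
from the KVM_PPC_GET_HTAB_FD ioctl and the header struct is a local mirror of
the 8-byte header layout described above (no bounds checking on n_valid, for
brevity):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Local mirror of the stream header: index, n_valid, n_invalid */
struct htab_hdr {
	uint32_t index;
	uint16_t n_valid;
	uint16_t n_invalid;
};

/*
 * Dump the HPT stream from 'fd'.  Stops when a read() returns 0 bytes,
 * i.e. when no changed HPTEs were found past the point where the last
 * read finished.
 */
static void dump_htab(int fd)
{
	unsigned char buf[65536];
	ssize_t len;

	while ((len = read(fd, buf, sizeof(buf))) > 0) {
		unsigned char *p = buf;

		while (p + sizeof(struct htab_hdr) <= buf + len) {
			struct htab_hdr hdr;
			uint64_t hpte[2];
			int i;

			memcpy(&hdr, p, sizeof(hdr));
			p += sizeof(hdr);
			for (i = 0; i < hdr.n_valid; i++) {
				memcpy(hpte, p, sizeof(hpte));
				p += sizeof(hpte);
				printf("%u+%d: %016llx %016llx\n",
				       hdr.index, i,
				       (unsigned long long)hpte[0],
				       (unsigned long long)hpte[1]);
			}
			/* the n_invalid entries that follow are implicit */
		}
	}
}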

Signed-off-by: Paul Mackerras pau...@samba.org
---
v2: added comments, added reserved field in struct kvm_get_htab_fd

 Documentation/virtual/kvm/api.txt|   53 +
 arch/powerpc/include/asm/kvm_book3s_64.h |   22 ++
 arch/powerpc/include/asm/kvm_ppc.h   |2 +
 arch/powerpc/include/uapi/asm/kvm.h  |   25 +++
 arch/powerpc/kvm/book3s_64_mmu_hv.c  |  344 ++
 arch/powerpc/kvm/book3s_hv.c |   12 --
 arch/powerpc/kvm/powerpc.c   |   17 ++
 include/uapi/linux/kvm.h |3 +
 8 files changed, 466 insertions(+), 12 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 6671fdc..33080ea 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2071,6 +2071,59 @@ KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; 
source cpu in parm
 
 Note that the vcpu ioctl is asynchronous to vcpu execution.
 
+4.78 KVM_PPC_GET_HTAB_FD
+
+Capability: KVM_CAP_PPC_HTAB_FD
+Architectures: powerpc
+Type: vm ioctl
+Parameters: Pointer to struct kvm_get_htab_fd (in)
+Returns: file descriptor number (>= 0) on success, -1 on error
+
+This returns a file descriptor that can be used either to read out the
+entries in the guest's hashed page table (HPT), or to write entries to
+initialize the HPT.  The returned fd can only be written to if the
+KVM_GET_HTAB_WRITE bit is set in the flags field of the argument, and
+can only be read if that bit is clear.  The argument struct looks like
+this:
+
+/* For KVM_PPC_GET_HTAB_FD */
+struct kvm_get_htab_fd {
+   __u64   flags;
+   __u64   start_index;
+};
+
+/* Values for kvm_get_htab_fd.flags */
+#define KVM_GET_HTAB_BOLTED_ONLY   ((__u64)0x1)
+#define KVM_GET_HTAB_WRITE ((__u64)0x2)
+
+The `start_index' field gives the index in the HPT of the entry at
+which to start reading.  It is ignored when writing.
+
+Reads on the fd will initially supply information about all
+interesting HPT entries.  Interesting entries are those with the
+bolted bit set, if the KVM_GET_HTAB_BOLTED_ONLY bit is set, otherwise
+all entries.  When the end of the HPT is reached, the read() will
+return.  If read() is called again on the fd, it will start again from
+the beginning of the HPT, but will only return HPT entries that have
+changed since they were last read.
+
+Data read or written is structured as a header (8 bytes) followed by a
+series of valid HPT entries (16 bytes) each.  The header indicates how
+many valid HPT entries there are and how many invalid entries follow
+the valid entries.  The invalid entries are not represented explicitly
+in the stream.  The header format is:
+
+struct kvm_get_htab_header {
+   __u32   index;
+   __u16   n_valid;
+   __u16   n_invalid;
+};
+
+Writes to the fd create HPT entries starting at the index given in the
+header; first `n_valid' valid entries with contents from the data
+written, then `n_invalid' invalid entries, invalidating any previously
+valid entries found.
+
 
 5. The kvm_run structure
 
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
b/arch/powerpc/include/asm/kvm_book3s_64.h
index b322e5b..38bec1d 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -246,4 +246,26 @@ static inline bool slot_is_aligned(struct kvm_memory_slot 
*memslot,
return !(memslot->base_gfn & mask) && !(memslot->npages & mask);
 }
 
+/*
+ * This 

Re: [patch 18/18] KVM: x86: update pvclock area conditionally, on cpu migration

2012-11-20 Thread Glauber Costa
On 11/20/2012 01:58 AM, Marcelo Tosatti wrote:
 As requested by Glauber, do not update kvmclock area on vcpu-pcpu 
 migration, in case the host has stable TSC. 
 
 This is to reduce cacheline bouncing.
 
 Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
 
 Index: vsyscall/arch/x86/kvm/x86.c
 ===
 --- vsyscall.orig/arch/x86/kvm/x86.c
 +++ vsyscall/arch/x86/kvm/x86.c
 @@ -2615,7 +2615,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu 
  kvm_x86_ops->write_tsc_offset(vcpu, offset);
  vcpu->arch.tsc_catchup = 1;
   }
 - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 + /*
 +  * On a host with synchronized TSC, there is no need to update
 +  * kvmclock on vcpu-cpu migration
 +  */
 + if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
 + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
  if (vcpu->cpu != cpu)
  kvm_migrate_timers(vcpu);
  vcpu->cpu = cpu;

Ok. Since you are only touching the one in kvm_arch_vcpu_load() and
leaving the others untouched, it looks correct.

Acked-by: Glauber Costa glom...@parallels.com


[PATCH] KVM: PPC: Book3S HV: Fix bug causing loss of page dirty state

2012-11-20 Thread Paul Mackerras
This fixes a bug where adding a new guest HPT entry via the H_ENTER
hcall would lose the changed bit in the reverse map information
for the guest physical page being mapped.  The result was that the
KVM_GET_DIRTY_LOG could return a zero bit for the page even though
the page had been modified by the guest.

This fixes it by only modifying the index and present bits in the
reverse map entry, thus preserving the reference and change bits.
We were also unnecessarily setting the reference bit, and this
fixes that too.

Signed-off-by: Paul Mackerras pau...@samba.org
---
This is against Alex Graf's kvm-ppc-next branch plus the series of three
patches I just sent, but it should be independent of that series.

 arch/powerpc/kvm/book3s_hv_rm_mmu.c |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 2334000..fc3da32 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -59,10 +59,10 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct 
revmap_entry *rev,
head->back = pte_index;
} else {
rev->forw = rev->back = pte_index;
-   i = pte_index;
+   *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) |
+   pte_index | KVMPPC_RMAP_PRESENT;
}
-   smp_wmb();
-   *rmap = i | KVMPPC_RMAP_REFERENCED | KVMPPC_RMAP_PRESENT; /* unlock */
+   unlock_rmap(rmap);
 }
 EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
 
-- 
1.7.10.4



Re: [patch 00/18] pvclock vsyscall support + KVM hypervisor support (v5)

2012-11-20 Thread Glauber Costa
On 11/20/2012 01:57 AM, Marcelo Tosatti wrote:
 This patchset, based on earlier work by Jeremy Fitzhardinge, implements
 paravirtual clock vsyscall support.
 
 It should be possible to implement Xen support relatively easily.
 
 It reduces clock_gettime from 500 cycles to 200 cycles
 on my testbox.
 

There are no more significant objections from my side.
I will still try to go through it again today just in case.



Re: [PATCH] KVM: PPC: Book3S HV: Fix bug causing loss of page dirty state

2012-11-20 Thread Alexander Graf

On 20.11.2012, at 10:01, Paul Mackerras wrote:

 This fixes a bug where adding a new guest HPT entry via the H_ENTER
 hcall would lose the changed bit in the reverse map information
 for the guest physical page being mapped.  The result was that the
 KVM_GET_DIRTY_LOG could return a zero bit for the page even though
 the page had been modified by the guest.
 
 This fixes it by only modifying the index and present bits in the
 reverse map entry, thus preserving the reference and change bits.
 We were also unnecessarily setting the reference bit, and this
 fixes that too.
 
 Signed-off-by: Paul Mackerras pau...@samba.org

Thanks, applied to kvm-ppc-next.

Alex

 ---
 This is against Alex Graf's kvm-ppc-next branch plus the series of three
 patches I just sent, but it should be independent of that series.
 
 arch/powerpc/kvm/book3s_hv_rm_mmu.c |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
 
 diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
 b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
 index 2334000..fc3da32 100644
 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
 +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
 @@ -59,10 +59,10 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct 
 revmap_entry *rev,
  head->back = pte_index;
   } else {
  rev->forw = rev->back = pte_index;
 - i = pte_index;
 + *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) |
 + pte_index | KVMPPC_RMAP_PRESENT;
   }
 - smp_wmb();
 - *rmap = i | KVMPPC_RMAP_REFERENCED | KVMPPC_RMAP_PRESENT; /* unlock */
 + unlock_rmap(rmap);
 }
 EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
 
 -- 
 1.7.10.4
 


Re: [PATCH v2 3/3] KVM: PPC: Book3S HV: Provide a method for userspace to read and write the HPT

2012-11-20 Thread Alexander Graf

On 20.11.2012, at 09:57, Paul Mackerras wrote:

 A new ioctl, KVM_PPC_GET_HTAB_FD, returns a file descriptor.  Reads on
 this fd return the contents of the HPT (hashed page table), writes
 create and/or remove entries in the HPT.  There is a new capability,
 KVM_CAP_PPC_HTAB_FD, to indicate the presence of the ioctl.  The ioctl
 takes an argument structure with the index of the first HPT entry to
 read out and a set of flags.  The flags indicate whether the user is
 intending to read or write the HPT, and whether to return all entries
 or only the bolted entries (those with the bolted bit, 0x10, set in
 the first doubleword).
 
 This is intended for use in implementing qemu's savevm/loadvm and for
 live migration.  Therefore, on reads, the first pass returns information
 about all HPTEs (or all bolted HPTEs).  When the first pass reaches the
 end of the HPT, it returns from the read.  Subsequent reads only return
 information about HPTEs that have changed since they were last read.
 A read that finds no changed HPTEs in the HPT following where the last
 read finished will return 0 bytes.
 
 The format of the data provides a simple run-length compression of the
 invalid entries.  Each block of data starts with a header that indicates
 the index (position in the HPT, which is just an array), the number of
 valid entries starting at that index (may be zero), and the number of
 invalid entries following those valid entries.  The valid entries, 16
 bytes each, follow the header.  The invalid entries are not explicitly
 represented.
 
 Signed-off-by: Paul Mackerras pau...@samba.org
 ---
 v2: added comments, added reserved field in struct kvm_get_htab_fd
 
 Documentation/virtual/kvm/api.txt|   53 +
 arch/powerpc/include/asm/kvm_book3s_64.h |   22 ++
 arch/powerpc/include/asm/kvm_ppc.h   |2 +
 arch/powerpc/include/uapi/asm/kvm.h  |   25 +++
 arch/powerpc/kvm/book3s_64_mmu_hv.c  |  344 ++
 arch/powerpc/kvm/book3s_hv.c |   12 --
 arch/powerpc/kvm/powerpc.c   |   17 ++
 include/uapi/linux/kvm.h |3 +
 8 files changed, 466 insertions(+), 12 deletions(-)
 
 diff --git a/Documentation/virtual/kvm/api.txt 
 b/Documentation/virtual/kvm/api.txt
 index 6671fdc..33080ea 100644
 --- a/Documentation/virtual/kvm/api.txt
 +++ b/Documentation/virtual/kvm/api.txt
 @@ -2071,6 +2071,59 @@ KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external 
 call; source cpu in parm
 
 Note that the vcpu ioctl is asynchronous to vcpu execution.
 
 +4.78 KVM_PPC_GET_HTAB_FD
 +
 +Capability: KVM_CAP_PPC_HTAB_FD
 +Architectures: powerpc
 +Type: vm ioctl
 +Parameters: Pointer to struct kvm_get_htab_fd (in)
 +Returns: file descriptor number (>= 0) on success, -1 on error
 +
 +This returns a file descriptor that can be used either to read out the
 +entries in the guest's hashed page table (HPT), or to write entries to
 +initialize the HPT.  The returned fd can only be written to if the
 +KVM_GET_HTAB_WRITE bit is set in the flags field of the argument, and
 +can only be read if that bit is clear.  The argument struct looks like
 +this:
 +
 +/* For KVM_PPC_GET_HTAB_FD */
 +struct kvm_get_htab_fd {
 + __u64   flags;
 + __u64   start_index;

Documentation is out of sync :).

Applied all 3 with fixed documentation.


Alex



Re: [PATCH 2/3] s390: Virtual channel subsystem support.

2012-11-20 Thread Cornelia Huck
On Mon, 19 Nov 2012 14:30:00 +0100
Alexander Graf ag...@suse.de wrote:

 
 On 31.10.2012, at 17:24, Cornelia Huck wrote:
 
  Provide a mechanism for qemu to provide fully virtual subchannels to
  the guest. In the KVM case, this relies on the kernel's css support
  for I/O and machine check interrupt handling. The !KVM case handles
  interrupts on its own.
  
  Signed-off-by: Cornelia Huck cornelia.h...@de.ibm.com
  ---
  hw/s390x/Makefile.objs |1 +
  hw/s390x/css.c | 1209 
  
  hw/s390x/css.h |   90 
  target-s390x/Makefile.objs |2 +-
  target-s390x/cpu.h |  232 +
  target-s390x/helper.c  |  146 ++
  target-s390x/ioinst.c  |  737 +++
  target-s390x/ioinst.h  |  213 
  target-s390x/kvm.c |  251 -
  target-s390x/misc_helper.c |6 +-
  10 files changed, 2872 insertions(+), 15 deletions(-)
  create mode 100644 hw/s390x/css.c
  create mode 100644 hw/s390x/css.h
  create mode 100644 target-s390x/ioinst.c
  create mode 100644 target-s390x/ioinst.h
  
  diff --git a/hw/s390x/Makefile.objs b/hw/s390x/Makefile.objs
  index 096dfcd..378b099 100644
  --- a/hw/s390x/Makefile.objs
  +++ b/hw/s390x/Makefile.objs
  @@ -4,3 +4,4 @@ obj-y := $(addprefix ../,$(obj-y))
  obj-y += sclp.o
  obj-y += event-facility.o
  obj-y += sclpquiesce.o sclpconsole.o
  +obj-y += css.o
  diff --git a/hw/s390x/css.c b/hw/s390x/css.c
  new file mode 100644
  index 000..9adffb3
  --- /dev/null
  +++ b/hw/s390x/css.c
  @@ -0,0 +1,1209 @@
  +/*
  + * Channel subsystem base support.
  + *
  + * Copyright 2012 IBM Corp.
  + * Author(s): Cornelia Huck cornelia.h...@de.ibm.com
  + *
  + * This work is licensed under the terms of the GNU GPL, version 2 or (at
  + * your option) any later version. See the COPYING file in the top-level
  + * directory.
  + */
  +
   +#include "qemu-thread.h"
   +#include "qemu-queue.h"
   +#include "hw/qdev.h"
   +#include "bitops.h"
   +#include "kvm.h"
   +#include "cpu.h"
   +#include "ioinst.h"
   +#include "css.h"
   +#include "virtio-ccw.h"
  +
  +typedef struct CrwContainer {
  +CRW crw;
  +QTAILQ_ENTRY(CrwContainer) sibling;
  +} CrwContainer;
  +
  +typedef struct ChpInfo {
  +uint8_t in_use;
  +uint8_t type;
  +uint8_t is_virtual;
  +} ChpInfo;
  +
  +typedef struct SubchSet {
  +SubchDev *sch[MAX_SCHID + 1];
  +unsigned long schids_used[BITS_TO_LONGS(MAX_SCHID + 1)];
  +unsigned long devnos_used[BITS_TO_LONGS(MAX_SCHID + 1)];
  +} SubchSet;
  +
  +typedef struct CssImage {
  +SubchSet *sch_set[MAX_SSID + 1];
  +ChpInfo chpids[MAX_CHPID + 1];
  +} CssImage;
  +
  +typedef struct ChannelSubSys {
  +QTAILQ_HEAD(, CrwContainer) pending_crws;
  +bool do_crw_mchk;
  +bool crws_lost;
  +uint8_t max_cssid;
  +uint8_t max_ssid;
  +bool chnmon_active;
  +uint64_t chnmon_area;
  +CssImage *css[MAX_CSSID + 1];
  +uint8_t default_cssid;
  +} ChannelSubSys;
  +
  +static ChannelSubSys *channel_subsys;
  +
  +int css_create_css_image(uint8_t cssid, bool default_image)
  +{
   +if (cssid > MAX_CSSID) {
  +return -EINVAL;
  +}
   +if (channel_subsys->css[cssid]) {
  +return -EBUSY;
  +}
   +channel_subsys->css[cssid] = g_try_malloc0(sizeof(CssImage));
   +if (!channel_subsys->css[cssid]) {
  +return -ENOMEM;
  +}
  +if (default_image) {
   +channel_subsys->default_cssid = cssid;
  +}
  +return 0;
  +}
  +
  +static void css_write_phys_pmcw(uint64_t addr, PMCW *pmcw)
  +{
  +int i;
  +uint32_t offset = 0;
  +struct copy_pmcw {
  +uint32_t intparm;
  +uint16_t flags;
  +uint16_t devno;
  +uint8_t lpm;
  +uint8_t pnom;
  +uint8_t lpum;
  +uint8_t pim;
  +uint16_t mbi;
  +uint8_t pom;
  +uint8_t pam;
  +uint8_t chpid[8];
  +uint32_t chars;
  +} *copy;
 
 This needs to be packed. Also, it might be a good idea to separate the struct 
 definition from the actual code ;).
 
  +
  +copy = (struct copy_pmcw *)pmcw;
 
 This will break on any system that doesn't coincidently stick to the same 
 bitfield order as s390x. Please drop any usage of bitfields in QEMU source 
 code :).
 
   +stl_phys(addr + offset, copy->intparm);
   +offset += sizeof(copy->intparm);
 
 Can't you just use cpu_physical_memory_map() and assign things left and right 
 as you see fit? Or prepare the target endianness struct on the stack and 
 cpu_physical_memory_read/write it from/to guest memory.
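
(For illustration, a rough sketch of the second approach suggested above:
marshal a packed big-endian copy on the stack and write it with a single
call. The struct layout and field names are assumptions taken from the
copy_pmcw struct quoted above, not code from the patch.)

/* Sketch only: assumes PMCW exposes the same fields as copy_pmcw above. */
typedef struct QEMU_PACKED PMCWBigEndian {
    uint32_t intparm;
    uint16_t flags;
    uint16_t devno;
    uint8_t  lpm;
    uint8_t  pnom;
    uint8_t  lpum;
    uint8_t  pim;
    uint16_t mbi;
    uint8_t  pom;
    uint8_t  pam;
    uint8_t  chpid[8];
    uint32_t chars;
} PMCWBigEndian;

static void css_write_phys_pmcw_sketch(uint64_t addr, const PMCW *pmcw)
{
    PMCWBigEndian copy;

    copy.intparm = cpu_to_be32(pmcw->intparm);
    copy.flags   = cpu_to_be16(pmcw->flags);
    copy.devno   = cpu_to_be16(pmcw->devno);
    copy.lpm     = pmcw->lpm;
    copy.pnom    = pmcw->pnom;
    copy.lpum    = pmcw->lpum;
    copy.pim     = pmcw->pim;
    copy.mbi     = cpu_to_be16(pmcw->mbi);
    copy.pom     = pmcw->pom;
    copy.pam     = pmcw->pam;
    memcpy(copy.chpid, pmcw->chpid, sizeof(copy.chpid));
    copy.chars   = cpu_to_be32(pmcw->chars);

    /* one guest memory access instead of a store per field */
    cpu_physical_memory_write(addr, &copy, sizeof(copy));
}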

All that copying stuff (other places as well) was still on my todo list
- just wanted to get the patches out of the door so people could take a
look at the interface.

 
 Also, please split this patch into smaller patches :). As it is now it's very 
 hard to review. However, apart from the above issues (which may happen in 
 other places of the code further down, I just 

Re: [kvmarm] [PATCH v4 09/14] KVM: ARM: Emulation framework and CP15 emulation

2012-11-20 Thread Rusty Russell
Peter Maydell peter.mayd...@linaro.org writes:
 On 19 November 2012 15:01, Will Deacon will.dea...@arm.com wrote:
 On Sat, Nov 10, 2012 at 03:43:13PM +, Christoffer Dall wrote:
 +/*
 + * A15-specific CP15 registers.
 + * Important: Must be sorted ascending by CRn, CRM, Op1, Op2
 + */
 +static const struct coproc_reg a15_regs[] = {
 +   /* MPIDR: we use VMPIDR for guest access. */
 +   { CRn( 0), CRm( 0), Op1( 0), Op2( 5), is32,
 +   NULL, reset_mpidr, c0_MPIDR },
 +
 +   /* SCTLR: swapped by interrupt.S. */
 +   { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32,
 +   NULL, reset_val, c1_SCTLR, 0x00C50078 },

 Why is the SCTLR included here as an A15-specific register?

 Rusty might remember the exact answer, but probably because
 the SCTLR reset value is IMPDEF.

Indeed...

Cheers,
Rusty.


Re: [PATCH v4 0/2] x86: clear vmcss on all cpus when doing kdump if necessary

2012-11-20 Thread zhangyanfei
On 2012-11-20 08:32, Marcelo Tosatti wrote:
 On Fri, Nov 16, 2012 at 06:12:58PM +0800, zhangyanfei wrote:
 Hello Marcelo,

 Any thoughts?
 
 I thought a function call was OK, but its better to have all code in
 vmx.c. Please have an atomic notifier in kexec.c (registered by KVM
 module via atomic_notifier_chain_register etc).
 
 Other than that, which is largely cosmetic, it looks fine.
 Sorry for not expressing this earlier.
 
 

Hmm, Thanks. I will resend a new patch set.

Thanks
Zhang


Re: [PATCH v11] kvm: notify host when the guest is panicked

2012-11-20 Thread Hu Tao
Hi Marcelo,

On Tue, Nov 13, 2012 at 12:19:08AM -0200, Marcelo Tosatti wrote:
 On Fri, Nov 09, 2012 at 03:17:39PM -0500, Sasha Levin wrote:
  On Mon, Nov 5, 2012 at 8:58 PM, Hu Tao hu...@cn.fujitsu.com wrote:
   But in the case of panic notification, more dependency means more
   chances of failure of panic notification. Say, if we use a virtio device
   to do panic notification, then we will fail if: virtio itself has
   problems, virtio for some reason can't be deployed(neither built-in or
   as a module), or guest doesn't support virtio, etc.
  
  Add polling to your virtio device. If it didn't notify of a panic but
  taking more than 20 sec to answer your poll request you can assume
  it's dead.
  
  Actually, just use virtio-serial and something in userspace on the guest.
 
 They want the guest to stop, so a memory dump can be taken by management
 interface.
 
 Hu Tao, lets assume port I/O is the preferred method for communication.

Okey.

 Now, the following comments have still not been addressed:
 
 1) Lifecycle of the stopped guest and interaction with other stopped
 states in QEMU.

Patch 3 already deals with run state transitions. But in case I'm
missing something, could you be more specific?

 
 2) Format of the interface for other architectures (you can choose
 a different KVM supported architecture and write an example).
 
 3) Clear/documented management interface for the feature.

It is documented in patch 0: Documentation/virtual/kvm/pv_event.txt.
Does it need to be improved?


Re: Re: Re: [RFC PATCH 0/2] kvm/vmx: Output TSC offset

2012-11-20 Thread Yoshihiro YUNOMAE

Hi Marcelo,

Sorry for the late reply.

(2012/11/17 4:15), Marcelo Tosatti wrote:

On Wed, Nov 14, 2012 at 05:26:10PM +0900, Yoshihiro YUNOMAE wrote:

Thank you for commenting on my patch set.

(2012/11/14 11:31), Steven Rostedt wrote:

On Tue, 2012-11-13 at 18:03 -0800, David Sharp wrote:

On Tue, Nov 13, 2012 at 6:00 PM, Steven Rostedt rost...@goodmis.org wrote:

On Wed, 2012-11-14 at 10:36 +0900, Yoshihiro YUNOMAE wrote:


To merge the data like previous pattern, we apply this patch set. Then, we can
get TSC offset of the guest as follows:

$ dmesg | grep kvm
[   57.717180] kvm: (2687) write TSC offset 18446743360465545001, now clock ##
(here "2687" is the PID, "18446743360465545001" is the new TSC offset, and "##" stands for the host TSC value)



Using printk to export something like this is IMO a nasty hack.

Can't we create a /sys or /proc file to export the same thing?


Since the value changes over the course of the trace, and seems to be
part of the context of the trace, I think I'd include it as a
tracepoint.



I'm fine with that too.


Using some tracepoint is a nice idea, but there is one problem. Here,
our discussion point is the event which TSC offset is changed does not
frequently occur, but the buffer must keep the event data.

There are two ideas for using tracepoint. First, we define new
tracepoint for changed TSC offset. This is simple and the overhead will
be low. However, this trace event stored in the buffer will be
overwritten by other trace events because this TSC offset event does
not frequently occur. Second, we add TSC offset information to the
tracepoint frequently occured. For example, we assume that TSC offset
information is added to arguments of trace_kvm_exit().


The TSC offset is in the host trace. So given a host trace with two TSC
offset updates, how do you know which events in the guest trace
(containing a number of events) refer to which tsc offset update?

Unless i am missing something, you can't solve this easily (well, except
exporting information to the guest that allows it to transform RDTSC ->
host TSC value, which can be done via pvclock).


As you say, TSC offset events are in the host trace, but we don't need
to notify guests of updating TSC offset. The offset event will output
the next TSC offset value and the current TSC value, so we can
calculate the guest TSC (T1) for the event. Guest TSCs since T1 can be
converted to host TSC using the TSC offset, so we can integrate those
trace data.
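
In code form, the conversion described here is a single subtraction (a
sketch, assuming the VMX convention guest_tsc = host_tsc + tsc_offset):

#include <stdint.h>

/*
 * Merge-time conversion: 'tsc_offset' is the value reported by the
 * kvm_write_tsc_offset event in effect for this guest timestamp.
 */
static inline uint64_t guest_tsc_to_host_tsc(uint64_t guest_tsc,
					     uint64_t tsc_offset)
{
	/* unsigned 64-bit arithmetic copes with "negative" offsets */
	return guest_tsc - tsc_offset;
}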


Another issue as mentioned is lack of TSC synchronization in the host.
Should you provide such a feature without the possibility of proper
chronological order on systems with unsynchronized TSC?


I think, we cannot support this sorting feature using TSC on systems
with unsynchronized TSC. On systems with unsynchronized TSC, it is
difficult to sort not only trace data of guests and the host but trace
data of a guest or a host using TSC in chronological order. Actually,
if we want to output tracing data of ftrace in chronological order with
unsynchronized TSC, we will use the global mode as the timestamp. The
global mode uses wallclock added TSC correction, so the mode guarantees
to sort in chronological order for trace data of the guest or of
the host. If we use this mode to sort the trace data of guests and the
host in chronological order, we need to consider about the difference
between the guest and the host and timekeeping of guests and the host,
so it is difficult to solve these issues. At least, I haven't came up
with the good solution.

We cannot sort the trace data of guests and the host in chronological
order with unsynchronized TSC, but if we can set following
synchronization events for both guests and the host, we will know where
we should sort.

First, a guest and the host uses the global mode as the timestamp of
ftrace. Next, a user on the guest writes 1 to the synchronization I/F
as the ID, then the synchronization event 1 is recorded in a
ring-buffer of the guest. The synchronization operation induces
hypercall, so the host can handle the event. After the operation moves
to the host, the host records the event 1 in a ring-buffer of the
host. In the end, the operation returns to the host, and the
synchronization is finished.

When we integrate tracing data of the guest and the host, we
calculate difference of the timestamp between the synchronizing events
with the same ID. This value is a temporary offset. We will convert
the timestamp of the guests to the timestamp of the host before the
next synchronizing event. If the synchronizing event cycle is very
short, we will not need to consider the timekeeping. Then, we can sort
the trace data in chronological order.
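
A sketch of that correction (the names here are illustrative, not from any
posted patch):

#include <stdint.h>

/* One synchronization event with the same ID seen in both traces. */
struct sync_point {
	uint64_t guest_ts;	/* timestamp of the sync event in the guest trace */
	uint64_t host_ts;	/* timestamp of the same event in the host trace */
};

/*
 * Rebase a guest timestamp onto the host clock using the temporary offset
 * taken from the most recent synchronization pair; valid until the next
 * synchronization event.
 */
static inline uint64_t guest_ts_to_host_ts(uint64_t guest_ts,
					   const struct sync_point *sp)
{
	return guest_ts + (sp->host_ts - sp->guest_ts);
}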

Would you comment for this or do you have another idea?

Thanks,
--
Yoshihiro YUNOMAE
Software Platform Research Dept. Linux Technology Center
Hitachi, Ltd., Yokohama 

Re: Re: Re: [RFC PATCH 0/2] kvm/vmx: Output TSC offset

2012-11-20 Thread Yoshihiro YUNOMAE

Hi Steven,

Sorry for the late reply.

(2012/11/17 0:05), Steven Rostedt wrote:

On Wed, 2012-11-14 at 17:26 +0900, Yoshihiro YUNOMAE wrote:

Thank you for commenting on my patch set.

(2012/11/14 11:31), Steven Rostedt wrote:

On Tue, 2012-11-13 at 18:03 -0800, David Sharp wrote:

On Tue, Nov 13, 2012 at 6:00 PM, Steven Rostedt rost...@goodmis.org wrote:

On Wed, 2012-11-14 at 10:36 +0900, Yoshihiro YUNOMAE wrote:


To merge the data like previous pattern, we apply this patch set. Then, we can
get TSC offset of the guest as follows:

$ dmesg | grep kvm
[   57.717180] kvm: (2687) write TSC offset 18446743360465545001, now clock ##
(here "2687" is the PID, "18446743360465545001" is the new TSC offset, and "##" stands for the host TSC value)



Using printk to export something like this is IMO a nasty hack.

Can't we create a /sys or /proc file to export the same thing?


Since the value changes over the course of the trace, and seems to be
part of the context of the trace, I think I'd include it as a
tracepoint.



I'm fine with that too.


Using some tracepoint is a nice idea, but there is one problem. Here,
our discussion point is the event which TSC offset is changed does not
frequently occur, but the buffer must keep the event data.


If you can hold off a bit, for the 3.9 window, I plan on pushing
multiple buffers for ftrace. That is, you can create a separate buffer
just for the TSC offset events:

cd /sys/kernel/debug
echo tsc > instances/new
echo 1 > instances/tsc/events/tsc/offset/enable

Then the buffer will be used only for that event.


That's good. The tracepoint will output as follows:

qemu-kvm-12345  [000] 123456789: kvm_write_tsc_offset:
now_tsc=123456789 previous_offset=0 next_offset=123456780

Thanks,
--
Yoshihiro YUNOMAE
Software Platform Research Dept. Linux Technology Center
Hitachi, Ltd., Yokohama Research Laboratory
E-mail: yoshihiro.yunomae...@hitachi.com




Re: [PATCH v2 3/3] KVM: PPC: Book3S HV: Provide a method for userspace to read and write the HPT

2012-11-20 Thread Paul Mackerras
On Tue, Nov 20, 2012 at 10:16:24AM +0100, Alexander Graf wrote:

 Documentation is out of sync :).

Oops, sorry... :)

 Applied all 3 with fixed documentation.

Great, thanks.

Regards,
Paul.


Re: [PATCH 1/3] KVM: x86: clean up reexecute_instruction

2012-11-20 Thread Gleb Natapov
On Tue, Nov 20, 2012 at 07:58:32AM +0800, Xiao Guangrong wrote:
 Little cleanup for reexecute_instruction, also use gpa_to_gfn in
 retry_instruction
 
 Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
 ---
  arch/x86/kvm/x86.c |   13 ++---
  1 files changed, 6 insertions(+), 7 deletions(-)
 
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index 52ae8b5..7be8452 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -4477,19 +4477,18 @@ static bool reexecute_instruction(struct kvm_vcpu 
 *vcpu, gva_t gva)
   if (tdp_enabled)
   return false;
 
 + gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
 + if (gpa == UNMAPPED_GVA)
 + return true; /* let cpu generate fault */
 +
   /*
* if emulation was due to access to shadowed page table
* and it failed try to unshadow page and re-enter the
* guest to let CPU execute the instruction.
*/
 - if (kvm_mmu_unprotect_page_virt(vcpu, gva))
  + if (kvm_mmu_unprotect_page(vcpu->kvm, c(gpa)))
What's c()? Should be gpa_to_gfn(gpa)?

   return true;
 
 - gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
 -
 - if (gpa == UNMAPPED_GVA)
 - return true; /* let cpu generate fault */
 -
   /*
* Do not retry the unhandleable instruction if it faults on the
* readonly host memory, otherwise it will goto a infinite loop:
 @@ -4544,7 +4543,7 @@ static bool retry_instruction(struct x86_emulate_ctxt 
 *ctxt,
   if (!vcpu->arch.mmu.direct_map)
   gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
 
  - kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
  + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
 
   return true;
  }
 -- 
 1.7.7.6
 

--
Gleb.


[PATCH] KVM: Retire as maintainer

2012-11-20 Thread Avi Kivity
After six and a half years of writing and maintaining KVM, it is time to
move to new things.  Update my MAINTAINERS entry to reflect that.

Signed-off-by: Avi Kivity a...@redhat.com
---
 CREDITS | 5 +
 MAINTAINERS | 1 -
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/CREDITS b/CREDITS
index d8fe12a..2346b09 100644
--- a/CREDITS
+++ b/CREDITS
@@ -1823,6 +1823,11 @@ S: Kattreinstr 38
 S: D-64295
 S: Germany
 
+N: Avi Kivity
+E: avi.kiv...@gmail.com
+D: Kernel-based Virtual Machine (KVM)
+S: Ra'annana, Israel
+
 N: Andi Kleen
 E: a...@firstfloor.org
 U: http://www.halobates.de
diff --git a/MAINTAINERS b/MAINTAINERS
index bb0b27d..6b5b0b7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4280,7 +4280,6 @@ F:include/linux/lockd/
 F: include/linux/sunrpc/
 
 KERNEL VIRTUAL MACHINE (KVM)
-M: Avi Kivity a...@redhat.com
 M: Marcelo Tosatti mtosa...@redhat.com
 L: kvm@vger.kernel.org
 W: http://kvm.qumranet.com
-- 
1.7.12



Re: [PATCH] KVM: Retire as maintainer

2012-11-20 Thread Avi Kivity
On 11/20/2012 02:26 PM, Avi Kivity wrote:
 After six and a half years of writing and maintaining KVM, it is time to
 move to new things.  Update my MAINTAINERS entry to reflect that.

Gleb, please send an incremental patch (against kvm.git master)
installing yourself as a new co-maintainer.


-- 
error compiling committee.c: too many arguments to function


[PATCH] KVM: taking co-maintenance

2012-11-20 Thread Gleb Natapov
Updating MAINTAINERS file.

Signed-off-by: Gleb Natapov g...@redhat.com
diff --git a/MAINTAINERS b/MAINTAINERS
index 6b5b0b7..b273360 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4281,6 +4281,7 @@ F:include/linux/sunrpc/
 
 KERNEL VIRTUAL MACHINE (KVM)
 M: Marcelo Tosatti mtosa...@redhat.com
+M: Gleb Natapov g...@redhat.com
 L: kvm@vger.kernel.org
 W: http://kvm.qumranet.com
 S: Supported
--
Gleb.


Re: [PATCH] KVM: taking co-maintenance

2012-11-20 Thread Avi Kivity
On 11/20/2012 02:37 PM, Gleb Natapov wrote:
 Updating MAINTAINERS file.

Applied, thanks.


-- 
error compiling committee.c: too many arguments to function


[GIT PULL] KVM maintainership update

2012-11-20 Thread Avi Kivity
Linus, please pull from:

  git://git.kernel.org/pub/scm/virt/kvm/kvm.git master

to receive this MAINTAINERS update.  After many years of maintaining KVM,
I am moving on.  It was a real pleasure for me to work with so many talented
and dedicated hackers on this project.

Replacing me will be one of those talented and dedicated hackers, Gleb, who
has authored hundreds of patches in and around KVM.



Avi Kivity (1):
  KVM: Retire as maintainer

Gleb Natapov (1):
  KVM: taking co-maintenance

 CREDITS | 5 +
 MAINTAINERS | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/CREDITS b/CREDITS
index d8fe12a..2346b09 100644
--- a/CREDITS
+++ b/CREDITS
@@ -1823,6 +1823,11 @@ S: Kattreinstr 38
 S: D-64295
 S: Germany
 
+N: Avi Kivity
+E: avi.kiv...@gmail.com
+D: Kernel-based Virtual Machine (KVM)
+S: Ra'annana, Israel
+
 N: Andi Kleen
 E: a...@firstfloor.org
 U: http://www.halobates.de
diff --git a/MAINTAINERS b/MAINTAINERS
index bb0b27d..b273360 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4280,8 +4280,8 @@ F:include/linux/lockd/
 F: include/linux/sunrpc/
 
 KERNEL VIRTUAL MACHINE (KVM)
-M: Avi Kivity a...@redhat.com
 M: Marcelo Tosatti mtosa...@redhat.com
+M: Gleb Natapov g...@redhat.com
 L: kvm@vger.kernel.org
 W: http://kvm.qumranet.com
 S: Supported

-- 
error compiling committee.c: too many arguments to function


[RFC v4 1/8] linux-headers: Add ARM KVM headers (not for upstream)

2012-11-20 Thread Peter Maydell
This commit adds the ARM KVM headers. This is not to go to QEMU
upstream -- the correct path there is that the KVM code will be
committed to a mainline upstream kernel, and then upstream QEMU
can do a bulk header update from the upstream kernel, which will
allow us to drop this temporary commit.

This is the result of running update-headers on Christoffer's
kvm-arm-v14-vgic-timers branch (commit 68d116f).

This commit currently also includes some non-ARM header changes
which hopefully will have made it into QEMU upstream by the time
we submit this for merging.
---
 linux-headers/asm-arm/kvm.h  |  137 ++
 linux-headers/asm-arm/kvm_para.h |1 +
 linux-headers/asm-generic/kvm_para.h |4 +
 linux-headers/asm-powerpc/kvm.h  |   59 +++
 linux-headers/asm-powerpc/kvm_para.h |7 +-
 linux-headers/linux/kvm.h|   34 +++--
 6 files changed, 234 insertions(+), 8 deletions(-)
 create mode 100644 linux-headers/asm-arm/kvm.h
 create mode 100644 linux-headers/asm-arm/kvm_para.h
 create mode 100644 linux-headers/asm-generic/kvm_para.h

diff --git a/linux-headers/asm-arm/kvm.h b/linux-headers/asm-arm/kvm.h
new file mode 100644
index 000..b1c7871
--- /dev/null
+++ b/linux-headers/asm-arm/kvm.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall c.d...@virtualopensystems.com
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#ifndef __ARM_KVM_H__
+#define __ARM_KVM_H__
+
+#include <asm/types.h>
+#include <asm/ptrace.h>
+
+#define __KVM_HAVE_GUEST_DEBUG
+#define __KVM_HAVE_IRQ_LINE
+
+#define KVM_REG_SIZE(id)   \
+   (1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT))
+
+struct kvm_regs {
+   struct pt_regs usr_regs;/* R0_usr - R14_usr, PC, CPSR */
+   __u32 svc_regs[3];  /* SP_svc, LR_svc, SPSR_svc */
+   __u32 abt_regs[3];  /* SP_abt, LR_abt, SPSR_abt */
+   __u32 und_regs[3];  /* SP_und, LR_und, SPSR_und */
+   __u32 irq_regs[3];  /* SP_irq, LR_irq, SPSR_irq */
+   __u32 fiq_regs[8];  /* R8_fiq - R14_fiq, SPSR_fiq */
+};
+
+/* Supported Processor Types */
+#define KVM_ARM_TARGET_CORTEX_A15  0
+#define KVM_ARM_NUM_TARGETS1
+
+/* KVM_SET_DEVICE_ADDRESS ioctl id encoding */
+#define KVM_DEVICE_TYPE_SHIFT  0
+#define KVM_DEVICE_TYPE_MASK   (0xffff << KVM_DEVICE_TYPE_SHIFT)
+#define KVM_DEVICE_ID_SHIFT16
+#define KVM_DEVICE_ID_MASK (0xffff << KVM_DEVICE_ID_SHIFT)
+
+/* Supported device IDs */
+#define KVM_ARM_DEVICE_VGIC_V2 0
+
+/* Supported VGIC address types  */
+#define KVM_VGIC_V2_ADDR_TYPE_DIST 0
+#define KVM_VGIC_V2_ADDR_TYPE_CPU  1
+
+struct kvm_vcpu_init {
+   __u32 target;
+   __u32 features[7];
+};
+
+struct kvm_sregs {
+};
+
+struct kvm_fpu {
+};
+
+struct kvm_guest_debug_arch {
+};
+
+struct kvm_debug_exit_arch {
+};
+
+struct kvm_sync_regs {
+};
+
+struct kvm_arch_memory_slot {
+};
+
+/* If you need to interpret the index values, here is the key: */
+#define KVM_REG_ARM_COPROC_MASK0x0FFF
+#define KVM_REG_ARM_COPROC_SHIFT   16
+#define KVM_REG_ARM_32_OPC2_MASK   0x0007
+#define KVM_REG_ARM_32_OPC2_SHIFT  0
+#define KVM_REG_ARM_OPC1_MASK  0x0078
+#define KVM_REG_ARM_OPC1_SHIFT 3
+#define KVM_REG_ARM_CRM_MASK   0x0780
+#define KVM_REG_ARM_CRM_SHIFT  7
+#define KVM_REG_ARM_32_CRN_MASK0x7800
+#define KVM_REG_ARM_32_CRN_SHIFT   11
+
+/* Normal registers are mapped as coprocessor 16. */
+#define KVM_REG_ARM_CORE   (0x0010 << KVM_REG_ARM_COPROC_SHIFT)
+#define KVM_REG_ARM_CORE_REG(name) (offsetof(struct kvm_regs, name) / 4)
+
+/* Some registers need more space to represent values. */
+#define KVM_REG_ARM_DEMUX  (0x0011 << KVM_REG_ARM_COPROC_SHIFT)
+#define KVM_REG_ARM_DEMUX_ID_MASK  0xFF00
+#define KVM_REG_ARM_DEMUX_ID_SHIFT 8
+#define KVM_REG_ARM_DEMUX_ID_CCSIDR (0x00 << KVM_REG_ARM_DEMUX_ID_SHIFT)
+#define KVM_REG_ARM_DEMUX_VAL_MASK 0x00FF
+#define KVM_REG_ARM_DEMUX_VAL_SHIFT0
+
+/* VFP registers: we could overload CP10 like ARM does, but that's ugly. */
+#define KVM_REG_ARM_VFP  

[RFC v4 8/8] oslib-posix: Align to permit transparent hugepages on ARM Linux

2012-11-20 Thread Peter Maydell
ARM Linux (like x86-64 Linux) can use transparent hugepages for
KVM if memory blocks are 2MiB aligned; set QEMU_VMALLOC_ALIGN
accordingly.

Signed-off-by: Peter Maydell peter.mayd...@linaro.org
---
 oslib-posix.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/oslib-posix.c b/oslib-posix.c
index 9db9c3d..d25b52a 100644
--- a/oslib-posix.c
+++ b/oslib-posix.c
@@ -35,7 +35,7 @@
 extern int daemon(int, int);
 #endif
 
-#if defined(__linux__) && defined(__x86_64__)
+#if defined(__linux__) && (defined(__x86_64__) || defined(__arm__))
/* Use 2 MiB alignment so transparent hugepages can be used by KVM.
   Valgrind does not support alignments larger than 1 MiB,
   therefore we need special code which handles running on Valgrind. */
-- 
1.7.9.5



[RFC v4 6/8] hw/kvm/arm_gic: Implement support for KVM in-kernel ARM GIC

2012-11-20 Thread Peter Maydell
Implement support for using the KVM in-kernel GIC for ARM.

Signed-off-by: Peter Maydell peter.mayd...@linaro.org
---
 hw/a15mpcore.c   |8 ++-
 hw/arm/Makefile.objs |1 +
 hw/kvm/arm_gic.c |  169 ++
 3 files changed, 177 insertions(+), 1 deletion(-)
 create mode 100644 hw/kvm/arm_gic.c

diff --git a/hw/a15mpcore.c b/hw/a15mpcore.c
index fc0a02a..31158f9 100644
--- a/hw/a15mpcore.c
+++ b/hw/a15mpcore.c
@@ -19,6 +19,7 @@
  */
 
 #include "sysbus.h"
+#include "kvm.h"
 
 /* A15MP private memory region.  */
 
@@ -40,8 +41,13 @@ static int a15mp_priv_init(SysBusDevice *dev)
 {
 A15MPPrivState *s = FROM_SYSBUS(A15MPPrivState, dev);
 SysBusDevice *busdev;
+const char *gictype = "arm-gic";
 
-s->gic = qdev_create(NULL, "arm_gic");
+if (kvm_irqchip_in_kernel()) {
+gictype = "kvm-arm-gic";
+}
+
+s->gic = qdev_create(NULL, gictype);
 qdev_prop_set_uint32(s->gic, "num-cpu", s->num_cpu);
 qdev_prop_set_uint32(s->gic, "num-irq", s->num_irq);
 qdev_prop_set_uint32(s->gic, "revision", 2);
diff --git a/hw/arm/Makefile.objs b/hw/arm/Makefile.objs
index 6d049e7..38b10a8 100644
--- a/hw/arm/Makefile.objs
+++ b/hw/arm/Makefile.objs
@@ -31,5 +31,6 @@ obj-y += collie.o
 obj-y += imx_serial.o imx_ccm.o imx_timer.o imx_avic.o
 obj-y += kzm.o
 obj-$(CONFIG_FDT) += ../device_tree.o
+obj-$(CONFIG_KVM) += kvm/arm_gic.o
 
 obj-y := $(addprefix ../,$(obj-y))
diff --git a/hw/kvm/arm_gic.c b/hw/kvm/arm_gic.c
new file mode 100644
index 000..0ad1b8b
--- /dev/null
+++ b/hw/kvm/arm_gic.c
@@ -0,0 +1,169 @@
+/*
+ * ARM Generic Interrupt Controller using KVM in-kernel support
+ *
+ * Copyright (c) 2012 Linaro Limited
+ * Written by Peter Maydell
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see http://www.gnu.org/licenses/.
+ */
+
+#include "hw/sysbus.h"
+#include "kvm.h"
+#include "kvm_arm.h"
+#include "hw/arm_gic_internal.h"
+
+#define TYPE_KVM_ARM_GIC "kvm-arm-gic"
+#define KVM_ARM_GIC(obj) \
+ OBJECT_CHECK(GICState, (obj), TYPE_KVM_ARM_GIC)
+#define KVM_ARM_GIC_CLASS(klass) \
+ OBJECT_CLASS_CHECK(KVMARMGICClass, (klass), TYPE_KVM_ARM_GIC)
+#define KVM_ARM_GIC_GET_CLASS(obj) \
+ OBJECT_GET_CLASS(KVMARMGICClass, (obj), TYPE_KVM_ARM_GIC)
+
+typedef struct KVMARMGICClass {
+ARMGICCommonClass parent_class;
+int (*parent_init)(SysBusDevice *dev);
+void (*parent_reset)(DeviceState *dev);
+} KVMARMGICClass;
+
+static void kvm_arm_gic_set_irq(void *opaque, int irq, int level)
+{
+/* Meaning of the 'irq' parameter:
+ *  [0..N-1] : external interrupts
+ *  [N..N+31] : PPI (internal) interrupts for CPU 0
+ *  [N+32..N+63] : PPI (internal interrupts for CPU 1
+ *  ...
+ * Convert this to the kernel's desired encoding, which
+ * has separate fields in the irq number for type,
+ * CPU number and interrupt number.
+ */
+GICState *s = (GICState *)opaque;
+int kvm_irq, irqtype, cpu;
+
+if (irq < (s->num_irq - GIC_INTERNAL)) {
+/* External interrupt. The kernel numbers these like the GIC
+ * hardware, with external interrupt IDs starting after the
+ * internal ones.
+ */
+irqtype = KVM_ARM_IRQ_TYPE_SPI;
+cpu = 0;
+irq += GIC_INTERNAL;
+} else {
+/* Internal interrupt: decode into (cpu, interrupt id) */
+irqtype = KVM_ARM_IRQ_TYPE_PPI;
+irq -= (s->num_irq - GIC_INTERNAL);
+cpu = irq / GIC_INTERNAL;
+irq %= GIC_INTERNAL;
+}
+kvm_irq = (irqtype << KVM_ARM_IRQ_TYPE_SHIFT)
+| (cpu << KVM_ARM_IRQ_VCPU_SHIFT) | irq;
+
+kvm_set_irq(kvm_state, kvm_irq, !!level);
+}
+
+static void kvm_arm_gic_put(GICState *s)
+{
+/* TODO: there isn't currently a kernel interface to set the GIC state */
+}
+
+static void kvm_arm_gic_get(GICState *s)
+{
+/* TODO: there isn't currently a kernel interface to get the GIC state */
+}
+
+static void kvm_arm_gic_reset(DeviceState *dev)
+{
+GICState *s = ARM_GIC_COMMON(dev);
+KVMARMGICClass *kgc = KVM_ARM_GIC_GET_CLASS(s);
+kgc->parent_reset(dev);
+kvm_arm_gic_put(s);
+}
+
+static int kvm_arm_gic_init(SysBusDevice *dev)
+{
+/* Device instance init function for the GIC sysbus device */
+int i;
+GICState *s = FROM_SYSBUS(GICState, dev);
+KVMARMGICClass *kgc = KVM_ARM_GIC_GET_CLASS(s);
+
+kgc->parent_init(dev);
+
+i = s->num_irq - GIC_INTERNAL;
+/* For the GIC, also 

[RFC v4 0/8] QEMU: Support KVM on ARM

2012-11-20 Thread Peter Maydell
Round 4 of the QEMU patches to support KVM for
ARM on Cortex-A15 hardware. It's intended for use with
the kernel tree at
 git://github.com/virtualopensystems/linux-kvm-arm.git kvm-arm-v14-vgic-timers

Still RFC pending the kernel patches actually being accepted
upstream...

Changes v3 to v4:
 * minor updates to match kernel ABI changes (ID field in
   kvm_device_address is now 64 bits, core register offsets now
   changed due to use of pt_regs struct)
 * squashed the two 'update kernel headers' patches, since the
   plan is for vgic support to go upstream at the same time as
   the baseline kernel patchset
 * added a new patch 8 which adds ARM to the list of Linux archs
   which prefer 2MB alignment so they can use transparent hugepages

Changes v2 to v3:
 * applied various minor tweaks suggested during review of v2
 * rebased on master, resynced with kernel headers for v13
 * new patch 6 which uses a MemoryListener to track where the
   VGIC memory regions are mapped, so we can tell the kernel
   where they live in the memory map (via new ioctl
   KVM_SET_DEVICE_ADDRESS)

Git tree available at
 git://git.linaro.org/people/pmaydell/qemu-arm.git kvm-arm-v14
with pointy-clicky interface at
 
http://git.linaro.org/gitweb?p=people/pmaydell/qemu-arm.git;a=shortlog;h=refs/heads/kvm-arm-v14


Christoffer Dall (1):
  ARM: KVM: Add support for KVM on ARM architecture

Peter Maydell (7):
  linux-headers: Add ARM KVM headers (not for upstream)
  ARM KVM: save and load VFP registers from kernel
  hw/arm_gic: Add presave/postload hooks
  target-arm: Use MemoryListener to identify GIC base address for KVM
  hw/kvm/arm_gic: Implement support for KVM in-kernel ARM GIC
  configure: Enable KVM on ARM
  oslib-posix: Align to permit transparent hugepages on ARM Linux

 configure|2 +-
 hw/a15mpcore.c   |8 +-
 hw/arm/Makefile.objs |1 +
 hw/arm_gic_common.c  |   10 +
 hw/arm_gic_internal.h|2 +
 hw/arm_pic.c |   26 ++
 hw/kvm/arm_gic.c |  169 
 linux-headers/asm-arm/kvm.h  |  137 ++
 linux-headers/asm-arm/kvm_para.h |1 +
 linux-headers/asm-generic/kvm_para.h |4 +
 linux-headers/asm-powerpc/kvm.h  |   59 +
 linux-headers/asm-powerpc/kvm_para.h |7 +-
 linux-headers/linux/kvm.h|   34 ++-
 oslib-posix.c|2 +-
 target-arm/Makefile.objs |1 +
 target-arm/cpu.h |1 +
 target-arm/helper.c  |2 +-
 target-arm/kvm.c |  482 ++
 target-arm/kvm_arm.h |   32 +++
 19 files changed, 968 insertions(+), 12 deletions(-)
 create mode 100644 hw/kvm/arm_gic.c
 create mode 100644 linux-headers/asm-arm/kvm.h
 create mode 100644 linux-headers/asm-arm/kvm_para.h
 create mode 100644 linux-headers/asm-generic/kvm_para.h
 create mode 100644 target-arm/kvm.c
 create mode 100644 target-arm/kvm_arm.h

-- 
1.7.9.5



Re: [PATCH] vhost-blk: Add vhost-blk support v5

2012-11-20 Thread Michael S. Tsirkin
On Tue, Nov 20, 2012 at 02:39:40PM +0800, Asias He wrote:
 On 11/20/2012 04:26 AM, Michael S. Tsirkin wrote:
  On Mon, Nov 19, 2012 at 04:53:42PM +0800, Asias He wrote:
 vhost-blk is an in-kernel virtio-blk device accelerator.
 
  Due to the lack of a proper in-kernel AIO interface, this version converts
  the guest's I/O requests to bios and uses submit_bio() to submit I/O directly.
  So this version only supports raw block devices as the guest's disk image,
  e.g. /dev/sda, /dev/ram0. We can add file-based image support to
  vhost-blk once we have an in-kernel AIO interface. There is work in
  progress on an in-kernel AIO interface from Dave Kleikamp and Zach Brown:
 
 http://marc.info/?l=linux-fsdevelm=133312234313122
 
  Performance evaluation:
  -
  1) LKVM
  Fio with libaio ioengine on Fusion IO device using kvm tool
  IOPS(k)   Before   After   Improvement
  seq-read   107  121 +13.0%
  seq-write  130  179 +37.6%
  rnd-read   102  122 +19.6%
  rnd-write  125  159 +27.0%
 
  2) QEMU
  Fio with libaio ioengine on Fusion IO device using QEMU
  IOPS(k)   Before   After   Improvement
  seq-read   76   123 +61.8%
  seq-write  139  173 +24.4%
  rnd-read   73   120 +64.3%
  rnd-write  75   156 +108.0%
  
  Could you compare with dataplane qemu as well please?
 
 
 Well, I will try to collect it.
 
  
 
  Userspace bits:
  -
  1) LKVM
  The latest vhost-blk userspace bits for kvm tool can be found here:
  g...@github.com:asias/linux-kvm.git blk.vhost-blk
 
  2) QEMU
  The latest vhost-blk userspace prototype for QEMU can be found here:
  g...@github.com:asias/qemu.git blk.vhost-blk
 
  Changes in v5:
  - Do not assume the buffer layout
  - Fix wakeup race
 
  Changes in v4:
  - Mark req-status as userspace pointer
  - Use __copy_to_user() instead of copy_to_user() in vhost_blk_set_status()
  - Add if (need_resched()) schedule() in blk thread
  - Kill vhost_blk_stop_vq() and move it into vhost_blk_stop()
  - Use vq_err() instead of pr_warn()
  - Fail on unsupported request
  - Add flush in vhost_blk_set_features()
 
  Changes in v3:
  - Sending REQ_FLUSH bio instead of vfs_fsync, thanks Christoph!
  - Check file passed by user is a raw block device file
 
  Signed-off-by: Asias He as...@redhat.com
  
  Since there are files shared by this and vhost net
  it's easiest for me to merge this all through the
  vhost tree.
  
  Jens, could you ack this and the bio usage in this driver
  please?
  
  ---
   drivers/vhost/Kconfig |   1 +
   drivers/vhost/Kconfig.blk |  10 +
   drivers/vhost/Makefile|   2 +
   drivers/vhost/blk.c   | 697 
  ++
   drivers/vhost/blk.h   |   8 +
   5 files changed, 718 insertions(+)
   create mode 100644 drivers/vhost/Kconfig.blk
   create mode 100644 drivers/vhost/blk.c
   create mode 100644 drivers/vhost/blk.h
 
  diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
  index 202bba6..acd8038 100644
  --- a/drivers/vhost/Kconfig
  +++ b/drivers/vhost/Kconfig
  @@ -11,4 +11,5 @@ config VHOST_NET
   
   if STAGING
    source "drivers/vhost/Kconfig.tcm"
   +source "drivers/vhost/Kconfig.blk"
   endif
  diff --git a/drivers/vhost/Kconfig.blk b/drivers/vhost/Kconfig.blk
  new file mode 100644
  index 000..ff8ab76
  --- /dev/null
  +++ b/drivers/vhost/Kconfig.blk
  @@ -0,0 +1,10 @@
  +config VHOST_BLK
   +  tristate "Host kernel accelerator for virtio blk (EXPERIMENTAL)"
   +  depends on BLOCK && EXPERIMENTAL && m
  +  ---help---
  +This kernel module can be loaded in host kernel to accelerate
  +guest block with virtio_blk. Not to be confused with virtio_blk
  +module itself which needs to be loaded in guest kernel.
  +
  +To compile this driver as a module, choose M here: the module will
  +be called vhost_blk.
  diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
  index a27b053..1a8a4a5 100644
  --- a/drivers/vhost/Makefile
  +++ b/drivers/vhost/Makefile
  @@ -2,3 +2,5 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o
   vhost_net-y := vhost.o net.o
   
   obj-$(CONFIG_TCM_VHOST) += tcm_vhost.o
  +obj-$(CONFIG_VHOST_BLK) += vhost_blk.o
  +vhost_blk-y := blk.o
  diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c
  new file mode 100644
  index 000..f0f118a
  --- /dev/null
  +++ b/drivers/vhost/blk.c
  @@ -0,0 +1,697 @@
  +/*
  + * Copyright (C) 2011 Taobao, Inc.
  + * Author: Liu Yuan tailai...@taobao.com
  + *
  + * Copyright (C) 2012 Red Hat, Inc.
  + * Author: Asias He as...@redhat.com
  + *
  + * This work is licensed under the terms of the GNU GPL, version 2.
  + *
  + * virtio-blk server in host kernel.
  + */
  +
   +#include <linux/miscdevice.h>
   +#include <linux/module.h>
   +#include <linux/vhost.h>
   +#include <linux/virtio_blk.h>
   +#include <linux/mutex.h>
   +#include <linux/file.h>
   +#include <linux/kthread.h>
   +#include <linux/blkdev.h>
  +#include 

Re: [PATCH 0/4] AER-KVM: Error containment of PCI pass-thru devices assigned to KVM guests

2012-11-20 Thread Stefan Hajnoczi
On Tue, Nov 20, 2012 at 06:31:48AM +, Pandarathil, Vijaymohan R wrote:
 Add support for error containment when a PCI pass-thru device assigned to a 
 KVM
 guest encounters an error. This is for PCIe devices/drivers that support AER
 functionality. When the OS is notified of an error in a device either
 through the firmware first approach or through an interrupt handled by the AER
 root port driver, concerned subsystems are notified by invoking callbacks
 registered by these subsystems. The device is also marked as tainted till the
 corresponding driver recovery routines are successful. 
 
 KVM module registers for a notification of such errors. In the KVM callback
 routine, a global counter is incremented to keep track of the error
 notification. Before each CPU enters guest mode to execute guest code,
 appropriate checks are done to see if the impacted device belongs to the guest
 or not. If the device belongs to the guest, qemu hypervisor for the guest is
 informed and the guest is immediately brought down, thus preventing or
 minimizing chances of any bad data being written out by the guest driver
 after the device has encountered an error.

I'm surprised that the hypervisor would shut down the guest when PCIe
AER kicks in for a pass-through device.  Shouldn't we pass the AER event
into the guest and deal with it there?

The equivalent to this policy on physical hardware would be that the CPU
is reset or the machine is powered down on AER.  That doesn't sound
right.

Stefan


Re: [kvmarm] [RFC v4 8/8] oslib-posix: Align to permit transparent hugepages on ARM Linux

2012-11-20 Thread Alexander Graf

On 11/20/2012 02:31 PM, Peter Maydell wrote:

ARM Linux (like x86-64 Linux) can use transparent hugepages for
KVM if memory blocks are 2MiB aligned; set QEMU_VMALLOC_ALIGN
accordingly.

Signed-off-by: Peter Maydellpeter.mayd...@linaro.org
---
  oslib-posix.c |2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/oslib-posix.c b/oslib-posix.c
index 9db9c3d..d25b52a 100644
--- a/oslib-posix.c
+++ b/oslib-posix.c
@@ -35,7 +35,7 @@
  extern int daemon(int, int);
  #endif

-#if defined(__linux__) && defined(__x86_64__)
+#if defined(__linux__) && (defined(__x86_64__) || defined(__arm__))


Why not just drop the arch specific bit?


Alex



[RFC v4 7/8] configure: Enable KVM on ARM

2012-11-20 Thread Peter Maydell
Enable KVM on ARM hosts, now that all the necessary components
for it exist.

Signed-off-by: Peter Maydell peter.mayd...@linaro.org
---
 configure |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure b/configure
index 780b19a..2438d6d 100755
--- a/configure
+++ b/configure
@@ -3927,7 +3927,7 @@ case $target_arch2 in
 echo CONFIG_NO_XEN=y  $config_target_mak
 esac
 case $target_arch2 in
-  i386|x86_64|ppcemb|ppc|ppc64|s390x)
+  arm|i386|x86_64|ppcemb|ppc|ppc64|s390x)
 # Make sure the target and host cpus are compatible
 if test $kvm = yes -a $target_softmmu = yes -a \
   \( $target_arch2 = $cpu -o \
-- 
1.7.9.5



[RFC v4 2/8] ARM: KVM: Add support for KVM on ARM architecture

2012-11-20 Thread Peter Maydell
From: Christoffer Dall cd...@cs.columbia.edu

Add basic support for KVM on ARM architecture.

Signed-off-by: Christoffer Dall cd...@cs.columbia.edu
[PMM: Minor tweaks and code cleanup, switch to ONE_REG]
Signed-off-by: Peter Maydell peter.mayd...@linaro.org
---
 hw/arm_pic.c |   26 
 target-arm/Makefile.objs |1 +
 target-arm/cpu.h |1 +
 target-arm/helper.c  |2 +-
 target-arm/kvm.c |  327 ++
 5 files changed, 356 insertions(+), 1 deletion(-)
 create mode 100644 target-arm/kvm.c

diff --git a/hw/arm_pic.c b/hw/arm_pic.c
index ffb4d41..874bbaf 100644
--- a/hw/arm_pic.c
+++ b/hw/arm_pic.c
@@ -9,6 +9,7 @@
 
 #include "hw.h"
 #include "arm-misc.h"
+#include "kvm.h"
 
 /* Input 0 is IRQ and input 1 is FIQ.  */
 static void arm_pic_cpu_handler(void *opaque, int irq, int level)
@@ -34,7 +35,32 @@ static void arm_pic_cpu_handler(void *opaque, int irq, int 
level)
 }
 }
 
+static void kvm_arm_pic_cpu_handler(void *opaque, int irq, int level)
+{
+#ifdef CONFIG_KVM
+ARMCPU *cpu = opaque;
+CPUARMState *env = &cpu->env;
+int kvm_irq = KVM_ARM_IRQ_TYPE_CPU << KVM_ARM_IRQ_TYPE_SHIFT;
+
+switch (irq) {
+case ARM_PIC_CPU_IRQ:
+kvm_irq |= KVM_ARM_IRQ_CPU_IRQ;
+break;
+case ARM_PIC_CPU_FIQ:
+kvm_irq |= KVM_ARM_IRQ_CPU_FIQ;
+break;
+default:
+hw_error("kvm_arm_pic_cpu_handler: Bad interrupt line %d\n", irq);
+}
+kvm_irq |= env->cpu_index << KVM_ARM_IRQ_VCPU_SHIFT;
+kvm_set_irq(kvm_state, kvm_irq, level ? 1 : 0);
+#endif
+}
+
 qemu_irq *arm_pic_init_cpu(ARMCPU *cpu)
 {
+if (kvm_enabled()) {
+return qemu_allocate_irqs(kvm_arm_pic_cpu_handler, cpu, 2);
+}
 return qemu_allocate_irqs(arm_pic_cpu_handler, cpu, 2);
 }
diff --git a/target-arm/Makefile.objs b/target-arm/Makefile.objs
index b6f1a9e..d89b57c 100644
--- a/target-arm/Makefile.objs
+++ b/target-arm/Makefile.objs
@@ -1,4 +1,5 @@
 obj-y += arm-semi.o
 obj-$(CONFIG_SOFTMMU) += machine.o
+obj-$(CONFIG_KVM) += kvm.o
 obj-y += translate.o op_helper.o helper.o cpu.o
 obj-y += neon_helper.o iwmmxt_helper.o
diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index e4ff918..44b5b4f 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -236,6 +236,7 @@ ARMCPU *cpu_arm_init(const char *cpu_model);
 void arm_translate_init(void);
 int cpu_arm_exec(CPUARMState *s);
 void do_interrupt(CPUARMState *);
+int bank_number(CPUARMState *env, int mode);
 void switch_mode(CPUARMState *, int);
 uint32_t do_arm_semihosting(CPUARMState *env);
 
diff --git a/target-arm/helper.c b/target-arm/helper.c
index ab8b734..6760d76 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -1614,7 +1614,7 @@ uint32_t HELPER(get_r13_banked)(CPUARMState *env, 
uint32_t mode)
 #else
 
 /* Map CPU modes onto saved register banks.  */
-static inline int bank_number(CPUARMState *env, int mode)
+int bank_number(CPUARMState *env, int mode)
 {
 switch (mode) {
 case ARM_CPU_MODE_USR:
diff --git a/target-arm/kvm.c b/target-arm/kvm.c
new file mode 100644
index 000..8e4b989
--- /dev/null
+++ b/target-arm/kvm.c
@@ -0,0 +1,327 @@
+/*
+ * ARM implementation of KVM hooks
+ *
+ * Copyright Christoffer Dall 2009-2010
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include <linux/kvm.h>
+
+#include "qemu-common.h"
+#include "qemu-timer.h"
+#include "sysemu.h"
+#include "kvm.h"
+#include "cpu.h"
+#include "hw/arm-misc.h"
+
+const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
+KVM_CAP_LAST_INFO
+};
+
+int kvm_arch_init(KVMState *s)
+{
+/* For ARM interrupt delivery is always asynchronous,
+ * whether we are using an in-kernel VGIC or not.
+ */
+kvm_async_interrupts_allowed = true;
+return 0;
+}
+
+int kvm_arch_init_vcpu(CPUARMState *env)
+{
+struct kvm_vcpu_init init;
+
+init.target = KVM_ARM_TARGET_CORTEX_A15;
+memset(init.features, 0, sizeof(init.features));
+return kvm_vcpu_ioctl(env, KVM_ARM_VCPU_INIT, &init);
+}
+
+typedef struct Reg {
+uint64_t id;
+int offset;
+} Reg;
+
+#define COREREG(KERNELNAME, QEMUFIELD)   \
+{\
+KVM_REG_ARM | KVM_REG_SIZE_U32 | \
+KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(KERNELNAME), \
+offsetof(CPUARMState, QEMUFIELD) \
+}
+
+#define CP15REG(CRN, CRM, OPC1, OPC2, QEMUFIELD) \
+{\
+KVM_REG_ARM | KVM_REG_SIZE_U32 | \
+(15 << KVM_REG_ARM_COPROC_SHIFT) |   \
+((CRN) << KVM_REG_ARM_32_CRN_SHIFT) |\
+((CRM) << KVM_REG_ARM_CRM_SHIFT) |   \
+((OPC1) << KVM_REG_ARM_OPC1_SHIFT) | \
+((OPC2) << KVM_REG_ARM_32_OPC2_SHIFT),   \
+

[RFC v4 3/8] ARM KVM: save and load VFP registers from kernel

2012-11-20 Thread Peter Maydell
Add support for saving and restoring VFP register state from the
kernel. This includes a check that the KVM-created CPU has full
VFP support (as the TCG Cortex-A15 model always does), since for
the moment ARM QEMU doesn't have any way to tweak optional features
on created CPUs.

Signed-off-by: Peter Maydell peter.mayd...@linaro.org
---
 target-arm/kvm.c |   78 +++---
 1 file changed, 75 insertions(+), 3 deletions(-)

diff --git a/target-arm/kvm.c b/target-arm/kvm.c
index 8e4b989..4217ad6 100644
--- a/target-arm/kvm.c
+++ b/target-arm/kvm.c
@@ -38,10 +38,28 @@ int kvm_arch_init(KVMState *s)
 int kvm_arch_init_vcpu(CPUARMState *env)
 {
 struct kvm_vcpu_init init;
+int ret;
+uint64_t v;
+struct kvm_one_reg r;
 
 init.target = KVM_ARM_TARGET_CORTEX_A15;
 memset(init.features, 0, sizeof(init.features));
-return kvm_vcpu_ioctl(env, KVM_ARM_VCPU_INIT, &init);
+ret = kvm_vcpu_ioctl(env, KVM_ARM_VCPU_INIT, &init);
+if (ret) {
+return ret;
+}
+/* Query the kernel to make sure it supports 32 VFP
+ * registers: QEMU's cortex-a15 CPU is always a
+ * VFP-D32 core. The simplest way to do this is just
+ * to attempt to read register d31.
+ */
+r.id = KVM_REG_ARM | KVM_REG_SIZE_U64 | KVM_REG_ARM_VFP | 31;
+r.addr = (uintptr_t)(&v);
+ret = kvm_vcpu_ioctl(env, KVM_GET_ONE_REG, &r);
+if (ret == ENOENT) {
+return EINVAL;
+}
+return ret;
 }
 
 typedef struct Reg {
@@ -67,6 +85,13 @@ typedef struct Reg {
 offsetof(CPUARMState, QEMUFIELD) \
 }
 
+#define VFPSYSREG(R)   \
+{  \
+KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_VFP | \
+KVM_REG_ARM_VFP_##R,   \
+offsetof(CPUARMState, vfp.xregs[ARM_VFP_##R])  \
+}
+
 static const Reg regs[] = {
 /* R0_usr .. R14_usr */
 COREREG(usr_regs.uregs[0], regs[0]),
@@ -114,6 +139,13 @@ static const Reg regs[] = {
 CP15REG(1, 0, 0, 0, cp15.c1_sys), /* SCTLR */
 CP15REG(2, 0, 0, 2, cp15.c2_control), /* TTBCR */
 CP15REG(3, 0, 0, 0, cp15.c3), /* DACR */
+/* VFP system registers */
+VFPSYSREG(FPSID),
+VFPSYSREG(MVFR1),
+VFPSYSREG(MVFR0),
+VFPSYSREG(FPEXC),
+VFPSYSREG(FPINST),
+VFPSYSREG(FPINST2),
 };
 
 int kvm_arch_put_registers(CPUARMState *env, int level)
@@ -121,7 +153,7 @@ int kvm_arch_put_registers(CPUARMState *env, int level)
 struct kvm_one_reg r;
 int mode, bn;
 int ret, i;
-uint32_t cpsr;
+uint32_t cpsr, fpscr;
 uint64_t ttbr;
 
 /* Make sure the banked regs are properly set */
@@ -172,6 +204,26 @@ int kvm_arch_put_registers(CPUARMState *env, int level)
+(2 << KVM_REG_ARM_CRM_SHIFT) | (1 << KVM_REG_ARM_OPC1_SHIFT);
 r.addr = (uintptr_t)(&ttbr);
 ret = kvm_vcpu_ioctl(env, KVM_SET_ONE_REG, &r);
+if (ret) {
+return ret;
+}
+
+/* VFP registers */
+r.id = KVM_REG_ARM | KVM_REG_SIZE_U64 | KVM_REG_ARM_VFP;
+for (i = 0; i < 32; i++) {
+r.addr = (uintptr_t)(&env->vfp.regs[i]);
+ret = kvm_vcpu_ioctl(env, KVM_SET_ONE_REG, &r);
+if (ret) {
+return ret;
+}
+r.id++;
+}
+
+r.id = KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_VFP |
+KVM_REG_ARM_VFP_FPSCR;
+fpscr = vfp_get_fpscr(env);
+r.addr = (uintptr_t)&fpscr;
+ret = kvm_vcpu_ioctl(env, KVM_SET_ONE_REG, &r);
 
 return ret;
 }
@@ -181,7 +233,7 @@ int kvm_arch_get_registers(CPUARMState *env)
 struct kvm_one_reg r;
 int mode, bn;
 int ret, i;
-uint32_t cpsr;
+uint32_t cpsr, fpscr;
 uint64_t ttbr;
 
 for (i = 0; i < ARRAY_SIZE(regs); i++) {
@@ -246,6 +298,26 @@ int kvm_arch_get_registers(CPUARMState *env)
 env-cp15.c2_mask = ~(0xu  env-cp15.c2_control);
 env-cp15.c2_base_mask = ~(0x3fffu  env-cp15.c2_control);
 
+/* VFP registers */
+r.id = KVM_REG_ARM | KVM_REG_SIZE_U64 | KVM_REG_ARM_VFP;
+for (i = 0; i < 32; i++) {
+r.addr = (uintptr_t)(&env->vfp.regs[i]);
+ret = kvm_vcpu_ioctl(env, KVM_GET_ONE_REG, &r);
+if (ret) {
+return ret;
+}
+r.id++;
+}
+
+r.id = KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_VFP |
+KVM_REG_ARM_VFP_FPSCR;
+r.addr = (uintptr_t)&fpscr;
+ret = kvm_vcpu_ioctl(env, KVM_GET_ONE_REG, &r);
+if (ret) {
+return ret;
+}
+vfp_set_fpscr(env, fpscr);
+
 return 0;
 }
 
-- 
1.7.9.5



[RFC v4 4/8] hw/arm_gic: Add presave/postload hooks

2012-11-20 Thread Peter Maydell
Add presave/postload hooks to the ARM GIC common base class.
These will be used by the KVM in-kernel GIC subclass to sync
state between kernel and userspace when migrating.

Signed-off-by: Peter Maydell peter.mayd...@linaro.org
Reviewed-by: Andreas Färber afaer...@suse.de
---
 hw/arm_gic_common.c   |   10 ++
 hw/arm_gic_internal.h |2 ++
 2 files changed, 12 insertions(+)

diff --git a/hw/arm_gic_common.c b/hw/arm_gic_common.c
index 8369309..961b44c 100644
--- a/hw/arm_gic_common.c
+++ b/hw/arm_gic_common.c
@@ -23,9 +23,14 @@
 static void gic_save(QEMUFile *f, void *opaque)
 {
 GICState *s = (GICState *)opaque;
+ARMGICCommonClass *c = ARM_GIC_COMMON_GET_CLASS(s);
 int i;
 int j;
 
+if (c-pre_save) {
+c-pre_save(s);
+}
+
 qemu_put_be32(f, s-enabled);
 for (i = 0; i  s-num_cpu; i++) {
 qemu_put_be32(f, s-cpu_enabled[i]);
@@ -57,6 +62,7 @@ static void gic_save(QEMUFile *f, void *opaque)
 static int gic_load(QEMUFile *f, void *opaque, int version_id)
 {
 GICState *s = (GICState *)opaque;
+ARMGICCommonClass *c = ARM_GIC_COMMON_GET_CLASS(s);
 int i;
 int j;
 
@@ -91,6 +97,10 @@ static int gic_load(QEMUFile *f, void *opaque, int 
version_id)
 s-irq_state[i].trigger = qemu_get_byte(f);
 }
 
+if (c-post_load) {
+c-post_load(s);
+}
+
 return 0;
 }
 
diff --git a/hw/arm_gic_internal.h b/hw/arm_gic_internal.h
index 699352c..3640be0 100644
--- a/hw/arm_gic_internal.h
+++ b/hw/arm_gic_internal.h
@@ -118,6 +118,8 @@ void gic_init_irqs_and_distributor(GICState *s, int 
num_irq);
 
 typedef struct ARMGICCommonClass {
 SysBusDeviceClass parent_class;
+void (*pre_save)(GICState *s);
+void (*post_load)(GICState *s);
 } ARMGICCommonClass;
 
 #define TYPE_ARM_GIC arm_gic
-- 
1.7.9.5



[RFC v4 5/8] target-arm: Use MemoryListener to identify GIC base address for KVM

2012-11-20 Thread Peter Maydell
When using an in-kernel GIC with KVM, we need to tell the kernel where
the GIC's memory mapped registers live. Do this by registering a
MemoryListener which tracks where the board model maps the A15's
private peripherals, so we can finish the GIC initialisation
when the GIC is actually mapped.

Signed-off-by: Peter Maydell peter.mayd...@linaro.org
---
 target-arm/kvm.c |   83 ++
 target-arm/kvm_arm.h |   32 +++
 2 files changed, 115 insertions(+)
 create mode 100644 target-arm/kvm_arm.h

diff --git a/target-arm/kvm.c b/target-arm/kvm.c
index 4217ad6..ff3007b 100644
--- a/target-arm/kvm.c
+++ b/target-arm/kvm.c
@@ -19,6 +19,7 @@
 #include qemu-timer.h
 #include sysemu.h
 #include kvm.h
+#include kvm_arm.h
 #include cpu.h
 #include hw/arm-misc.h
 
@@ -62,6 +63,88 @@ int kvm_arch_init_vcpu(CPUARMState *env)
 return ret;
 }
 
+/* We track all the KVM devices which need their memory addresses
+ * passing to the kernel in a list of these structures.
+ * When board init is complete we run through the list and
+ * tell the kernel the base addresses of the memory regions.
+ * We use a MemoryListener to track mapping and unmapping of
+ * the regions during board creation, so the board models don't
+ * need to do anything special for the KVM case.
+ */
+typedef struct KVMDevice {
+struct kvm_device_address kda;
+MemoryRegion *mr;
+QSLIST_ENTRY(KVMDevice) entries;
+} KVMDevice;
+
+static QSLIST_HEAD(kvm_devices_head, KVMDevice) kvm_devices_head;
+
+static void kvm_arm_devlistener_add(MemoryListener *listener,
+MemoryRegionSection *section)
+{
+KVMDevice *kd;
+QSLIST_FOREACH(kd, &kvm_devices_head, entries) {
+if (section->mr == kd->mr) {
+kd->kda.addr = section->offset_within_address_space;
+}
+}
+}
+
+static void kvm_arm_devlistener_del(MemoryListener *listener,
+MemoryRegionSection *section)
+{
+KVMDevice *kd;
+QSLIST_FOREACH(kd, &kvm_devices_head, entries) {
+if (section->mr == kd->mr) {
+kd->kda.addr = -1;
+}
+}
+}
+
+static MemoryListener devlistener = {
+.region_add = kvm_arm_devlistener_add,
+.region_del = kvm_arm_devlistener_del,
+};
+
+static void kvm_arm_machine_init_done(Notifier *notifier, void *data)
+{
+KVMDevice *kd, *tkd;
+memory_listener_unregister(&devlistener);
+QSLIST_FOREACH_SAFE(kd, &kvm_devices_head, entries, tkd) {
+if (kd->kda.addr != -1) {
+if (kvm_vm_ioctl(kvm_state, KVM_SET_DEVICE_ADDRESS, &kd->kda) < 0) {
+fprintf(stderr, "KVM_SET_DEVICE_ADDRESS failed: %s\n",
+strerror(errno));
+abort();
+}
+}
+g_free(kd);
+}
+}
+
+static Notifier notify = {
+.notify = kvm_arm_machine_init_done,
+};
+
+void kvm_arm_register_device(MemoryRegion *mr, uint64_t devid)
+{
+KVMDevice *kd;
+
+if (!kvm_irqchip_in_kernel()) {
+return;
+}
+
+if (QSLIST_EMPTY(&kvm_devices_head)) {
+memory_listener_register(&devlistener, NULL);
+qemu_add_machine_init_done_notifier(&notify);
+}
+kd = g_new0(KVMDevice, 1);
+kd->mr = mr;
+kd->kda.id = devid;
+kd->kda.addr = -1;
+QSLIST_INSERT_HEAD(&kvm_devices_head, kd, entries);
+}
+
 typedef struct Reg {
 uint64_t id;
 int offset;
diff --git a/target-arm/kvm_arm.h b/target-arm/kvm_arm.h
new file mode 100644
index 000..2f61d00
--- /dev/null
+++ b/target-arm/kvm_arm.h
@@ -0,0 +1,32 @@
+/*
+ * QEMU KVM support -- ARM specific functions.
+ *
+ * Copyright (c) 2012 Linaro Limited
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_KVM_ARM_H
+#define QEMU_KVM_ARM_H
+
+#include kvm.h
+#include memory.h
+
+/**
+ * kvm_arm_register_device:
+ * @mr: memory region for this device
+ * @devid: the KVM device ID
+ *
+ * Remember the memory region @mr, and when it is mapped by the
+ * machine model, tell the kernel that base address using the
+ * KVM_SET_DEVICE_ADDRESS ioctl. @devid should be the ID of
+ * the device as defined by KVM_SET_DEVICE_ADDRESS.
+ * The machine model may map and unmap the device multiple times;
+ * the kernel will only be told the final address at the point
+ * where machine init is complete.
+ */
+void kvm_arm_register_device(MemoryRegion *mr, uint64_t devid);
+
+#endif
-- 
1.7.9.5



Re: [kvmarm] [RFC v4 8/8] oslib-posix: Align to permit transparent hugepages on ARM Linux

2012-11-20 Thread Christoffer Dall
On Tue, Nov 20, 2012 at 8:54 AM, Alexander Graf ag...@suse.de wrote:
 On 11/20/2012 02:31 PM, Peter Maydell wrote:

 ARM Linux (like x86-64 Linux) can use transparent hugepages for
 KVM if memory blocks are 2MiB aligned; set QEMU_VMALLOC_ALIGN
 accordingly.

 Signed-off-by: Peter Maydellpeter.mayd...@linaro.org
 ---
   oslib-posix.c |2 +-
   1 file changed, 1 insertion(+), 1 deletion(-)

 diff --git a/oslib-posix.c b/oslib-posix.c
 index 9db9c3d..d25b52a 100644
 --- a/oslib-posix.c
 +++ b/oslib-posix.c
 @@ -35,7 +35,7 @@
   extern int daemon(int, int);
   #endif

  -#if defined(__linux__) && defined(__x86_64__)
  +#if defined(__linux__) && (defined(__x86_64__) || defined(__arm__))


 Why not just drop the arch specific bit?


other archs have other alignment requirements, iirc.


RE: [PATCH 0/4] AER-KVM: Error containment of PCI pass-thru devices assigned to KVM guests

2012-11-20 Thread Pandarathil, Vijaymohan R


 -Original Message-
 From: Stefan Hajnoczi [mailto:stefa...@gmail.com]
 Sent: Tuesday, November 20, 2012 5:41 AM
 To: Pandarathil, Vijaymohan R
 Cc: kvm@vger.kernel.org; linux-...@vger.kernel.org; qemu-de...@nongnu.org;
 linux-ker...@vger.kernel.org
 Subject: Re: [PATCH 0/4] AER-KVM: Error containment of PCI pass-thru
 devices assigned to KVM guests
 
 On Tue, Nov 20, 2012 at 06:31:48AM +, Pandarathil, Vijaymohan R wrote:
  Add support for error containment when a PCI pass-thru device assigned to
 a KVM
  guest encounters an error. This is for PCIe devices/drivers that support
 AER
  functionality. When the OS is notified of an error in a device either
  through the firmware first approach or through an interrupt handled by
 the AER
  root port driver, concerned subsystems are notified by invoking callbacks
  registered by these subsystems. The device is also marked as tainted till
 the
  corresponding driver recovery routines are successful.
 
  KVM module registers for a notification of such errors. In the KVM
 callback
  routine, a global counter is incremented to keep track of the error
  notification. Before each CPU enters guest mode to execute guest code,
  appropriate checks are done to see if the impacted device belongs to the
 guest
  or not. If the device belongs to the guest, qemu hypervisor for the guest
 is
  informed and the guest is immediately brought down, thus preventing or
  minimizing chances of any bad data being written out by the guest driver
  after the device has encountered an error.
 
 I'm surprised that the hypervisor would shut down the guest when PCIe
 AER kicks in for a pass-through device.  Shouldn't we pass the AER event
 into the guest and deal with it there?

Agreed. That would be the ideal behavior and is planned in a future patch.
Lack of control over the capabilities/type of the OS/drivers running in 
the guest is also a concern in passing along the event to the guest.

My understanding is that in the current implementation of Linux/KVM, these 
errors are not handled at all and can potentially cause a guest hang or 
crash or even data corruption depending on the implementation of the guest
driver for the device. As a first step, these patches make the behavior 
better by doing error containment with a predictable behavior when such
errors occur. 
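
For illustration only, the containment scheme described above boils down to something
like the sketch below. The registration hook (register_pcie_err_notifier) and the
device-ownership check are placeholder names, not the interfaces these patches add:

/* Sketch only -- not from the posted patches. */
#include <linux/notifier.h>
#include <linux/atomic.h>

static atomic_t pcie_err_seq = ATOMIC_INIT(0);	/* bumped on every AER event */

static int kvm_pcie_err_notify(struct notifier_block *nb,
			       unsigned long action, void *data)
{
	/* Just record that an error happened; the vcpu entry path
	 * checks the counter before re-entering guest mode. */
	atomic_inc(&pcie_err_seq);
	return NOTIFY_OK;
}

static struct notifier_block kvm_pcie_err_nb = {
	.notifier_call = kvm_pcie_err_notify,
};

/* register_pcie_err_notifier(&kvm_pcie_err_nb);   <-- placeholder name */

/* Conceptually, before each guest entry: if the sequence count changed and
 * the failing device is assigned to this guest, exit to userspace so QEMU
 * can stop the guest and let management take a memory dump. */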

 
 The equivalent to this policy on physical hardware would be that the CPU
 is reset or the machine is powered down on AER.  That doesn't sound
 right.
 
 Stefan


Re: [kvmarm] [RFC v4 8/8] oslib-posix: Align to permit transparent hugepages on ARM Linux

2012-11-20 Thread Alexander Graf

On 11/20/2012 02:55 PM, Christoffer Dall wrote:

On Tue, Nov 20, 2012 at 8:54 AM, Alexander Grafag...@suse.de  wrote:

On 11/20/2012 02:31 PM, Peter Maydell wrote:

ARM Linux (like x86-64 Linux) can use transparent hugepages for
KVM if memory blocks are 2MiB aligned; set QEMU_VMALLOC_ALIGN
accordingly.

Signed-off-by: Peter Maydellpeter.mayd...@linaro.org
---
   oslib-posix.c |2 +-
   1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/oslib-posix.c b/oslib-posix.c
index 9db9c3d..d25b52a 100644
--- a/oslib-posix.c
+++ b/oslib-posix.c
@@ -35,7 +35,7 @@
   extern int daemon(int, int);
   #endif

-#if defined(__linux__) && defined(__x86_64__)
+#if defined(__linux__) && (defined(__x86_64__) || defined(__arm__))


Why not just drop the arch specific bit?



other archs have other alignment requirements, iirc.


Ah, sorry, missed the rest of the lines around this one :). Yeah, should 
be ok then.



Alex



Re: [kvmarm] [RFC v4 8/8] oslib-posix: Align to permit transparent hugepages on ARM Linux

2012-11-20 Thread Peter Maydell
On 20 November 2012 14:37, Alexander Graf ag...@suse.de wrote:
 On 11/20/2012 02:55 PM, Christoffer Dall wrote:
 On Tue, Nov 20, 2012 at 8:54 AM, Alexander Grafag...@suse.de  wrote:
 On 11/20/2012 02:31 PM, Peter Maydell wrote:
 ARM Linux (like x86-64 Linux) can use transparent hugepages for
 KVM if memory blocks are 2MiB aligned; set QEMU_VMALLOC_ALIGN
 accordingly.

  -#if defined(__linux__) && defined(__x86_64__)
  +#if defined(__linux__) && (defined(__x86_64__) || defined(__arm__))


 Why not just drop the arch specific bit?

 other archs have other alignment requirements, iirc.

 Ah, sorry, missed the rest of the lines around this one :).
 Yeah, should be ok then.

Yeah. Ideally the kernel would provide a mechanism so we can
ask at runtime what the preferred alignment is. (Or it could
just automatically provide it for suitably large allocations.)
In the absence of that this is just following along with the
current style.
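
For what it's worth, the effect being relied on can be demonstrated outside QEMU
with a few lines of C. This is only an illustration of why the 2 MiB alignment
matters, not QEMU code; alignment alone is usually enough for khugepaged, the
madvise() is just an explicit hint:

#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

#define ALIGN_2MB (2 * 1024 * 1024)

static void *alloc_ram_block(size_t size)
{
    void *ptr = NULL;

    /* A 2 MiB-aligned anonymous allocation can be backed by
     * transparent hugepages on x86-64 and ARM Linux. */
    if (posix_memalign(&ptr, ALIGN_2MB, size)) {
        return NULL;
    }
    madvise(ptr, size, MADV_HUGEPAGE);  /* explicit THP hint, Linux >= 2.6.38 */
    memset(ptr, 0, size);               /* touch the memory so it is populated */
    return ptr;
}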

-- PMM


Re: KVM call agenda for 2012-11-20

2012-11-20 Thread Juan Quintela
Juan Quintela quint...@redhat.com wrote:
 Hi

 Please send in any agenda topics you are interested in.

As there are no topics, the call is cancelled.

happy hacking, Juan.


messed up with xml-files and configuration of a VM

2012-11-20 Thread Lentes, Bernd
Hi,

first, I'm new to KVM. I'm running KVM on a SLES 11 SP2 host, kernel 
3.0.13-0.27-default. My guest is an Ubuntu 12.04 LTS 64bit.
The guest has a CDROM attached, using an iso-file from a CIFS share. I detached 
it with the virtual machine manager (0.9.0).
I don't see the cd-rom anymore in the virtual machine manager. But when I try 
to start the vm, it complains about the missing iso-file.
Why? I detached it.
When I take a look at the xml-files of the guest, I find three! One 
in /var/lib/kvm/images, one in /etc/libvirt/qemu and one in /etc/kvm/vm.
Which one should I use to configure the vm? In the one in /etc/libvirt/qemu 
the cifs-share isn't mentioned any longer, in the other two it still is.
Is it possible to configure the vm by editing one of the XML-files?
Or shall I use virsh? Using virsh, does the vm have to be stopped, or can I edit 
the configuration of a running vm?
Why three xml-files? Why is detaching with the virtual machine manager not 
working?

Thanks for any inspiration.


Bernd

--
Bernd Lentes

Systemadministration
Institut für Entwicklungsgenetik
Gebäude 35.34 - Raum 208
HelmholtzZentrum münchen
bernd.len...@helmholtz-muenchen.de
phone: +49 89 3187 1241
fax:   +49 89 3187 2294
http://www.helmholtz-muenchen.de/idg

Wir sollten nicht den Tod fürchten, sondern
das schlechte Leben

Helmholtz Zentrum München
Deutsches Forschungszentrum für Gesundheit und Umwelt (GmbH)
Ingolstädter Landstr. 1
85764 Neuherberg
www.helmholtz-muenchen.de
Aufsichtsratsvorsitzende: MinDir´in Bärbel Brumme-Bothe
Geschäftsführer: Prof. Dr. Günther Wess und Dr. Nikolaus Blum
Registergericht: Amtsgericht München HRB 6466
USt-IdNr: DE 129521671


Re: [RFC PATCH 13/16] kvm tools: keep track of registered memory banks in struct kvm

2012-11-20 Thread Will Deacon
Hi Sasha,

On Tue, Nov 13, 2012 at 04:37:38AM +, Sasha Levin wrote:
 On 11/12/2012 06:57 AM, Will Deacon wrote:
   struct kvm {
  struct kvm_arch arch;
  struct kvm_config   cfg;
  @@ -49,6 +56,7 @@ struct kvm {
  u64 ram_size;
  void    *ram_start;
  u64 ram_pagesize;
  +   struct list_head    mem_banks;
 
 These memory banks actually look like a perfect example to use our augmented 
 interval rb-tree,
 can we switch them to use it, or is it a list on purpose?

I found some time to look at this today but unfortunately they're not as
ideally suited to the interval tree as they look: the problem being that we
need to search for banks by both host virtual address *and* guest physical
address depending on the translation that we're doing.

We could have two separate tress, but that seems like overkill given the
likely number of banks.

Will
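
For the record, the two lookups that make a single interval tree awkward look
roughly like this (invented names, a sketch rather than the kvmtool code):

struct mem_bank {
	u64			guest_phys_addr;
	void			*host_addr;
	u64			size;
	struct list_head	list;
};

/* guest physical -> host virtual: keyed on guest_phys_addr */
static void *gpa_to_hva(struct kvm *kvm, u64 gpa)
{
	struct mem_bank *bank;

	list_for_each_entry(bank, &kvm->mem_banks, list)
		if (gpa >= bank->guest_phys_addr &&
		    gpa < bank->guest_phys_addr + bank->size)
			return (char *)bank->host_addr +
			       (gpa - bank->guest_phys_addr);
	return NULL;
}

/* host virtual -> guest physical: keyed on host_addr instead, so one
 * rb-tree indexed by guest physical address would not cover it. */
static u64 hva_to_gpa(struct kvm *kvm, void *hva)
{
	struct mem_bank *bank;

	list_for_each_entry(bank, &kvm->mem_banks, list)
		if (hva >= bank->host_addr &&
		    (char *)hva < (char *)bank->host_addr + bank->size)
			return bank->guest_phys_addr +
			       ((char *)hva - (char *)bank->host_addr);
	return (u64)-1;
}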


Re: [PATCH] vfio powerpc: enabled and supported on powernv platform

2012-11-20 Thread Alex Williamson
On Tue, 2012-11-20 at 11:48 +1100, Alexey Kardashevskiy wrote:
 VFIO implements platform independent stuff such as
 a PCI driver, BAR access (via read/write on a file descriptor
 or direct mapping when possible) and IRQ signaling.
 The platform dependent part includes IOMMU initialization
 and handling.
 
 This patch initializes IOMMU groups based on the IOMMU
 configuration discovered during the PCI scan, only POWERNV
 platform is supported at the moment.
 
 Also the patch implements a VFIO-IOMMU driver which
 manages DMA mapping/unmapping requests coming from
 the client (now QEMU). It also returns a DMA window
 information to let the guest initialize the device tree
 for a guest OS properly. Although this driver has been
 tested only on POWERNV, it should work on any platform
 supporting TCE tables.
 
 To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
 option.
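
 As a rough sketch of the shape of that interface (the layout below follows what
 later became VFIO_IOMMU_SPAPR_TCE_GET_INFO in mainline; the RFC may differ):

 struct vfio_iommu_spapr_tce_info {
 	__u32	argsz;
 	__u32	flags;			/* reserved */
 	__u32	dma32_window_start;	/* 32 bit DMA window start (bus address) */
 	__u32	dma32_window_size;	/* 32 bit DMA window size */
 };

 /* Userspace (QEMU) would query it roughly as:
  *	struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
  *	ioctl(container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
  * and use the window start/size to populate the guest device tree.
  */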
 
 Cc: David Gibson da...@gibson.dropbear.id.au
 Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
 ---
  arch/powerpc/include/asm/iommu.h |6 +
  arch/powerpc/kernel/iommu.c  |  140 +++
  arch/powerpc/platforms/powernv/pci.c |  135 +++
  drivers/iommu/Kconfig|8 ++
  drivers/vfio/Kconfig |6 +
  drivers/vfio/Makefile|1 +
  drivers/vfio/vfio_iommu_spapr_tce.c  |  247 
 ++
  include/linux/vfio.h |   20 +++
  8 files changed, 563 insertions(+)
  create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
 
 diff --git a/arch/powerpc/include/asm/iommu.h 
 b/arch/powerpc/include/asm/iommu.h
 index cbfe678..5ba66cb 100644
 --- a/arch/powerpc/include/asm/iommu.h
 +++ b/arch/powerpc/include/asm/iommu.h
 @@ -64,30 +64,33 @@ struct iommu_pool {
  } cacheline_aligned_in_smp;
  
  struct iommu_table {
   unsigned long  it_busno; /* Bus number this table belongs to */
   unsigned long  it_size;  /* Size of iommu table in entries */
   unsigned long  it_offset;/* Offset into global table */
   unsigned long  it_base;  /* mapped address of tce table */
   unsigned long  it_index; /* which iommu table this is */
   unsigned long  it_type;  /* type: PCI or Virtual Bus */
   unsigned long  it_blocksize; /* Entries in each block (cacheline) */
   unsigned long  poolsize;
   unsigned long  nr_pools;
   struct iommu_pool large_pool;
   struct iommu_pool pools[IOMMU_NR_POOLS];
   unsigned long *it_map;   /* A simple allocation bitmap for now */
 +#ifdef CONFIG_IOMMU_API
 + struct iommu_group *it_group;
 +#endif
  };
  
  struct scatterlist;
  
  static inline void set_iommu_table_base(struct device *dev, void *base)
  {
   dev->archdata.dma_data.iommu_table_base = base;
  }
  
  static inline void *get_iommu_table_base(struct device *dev)
  {
   return dev->archdata.dma_data.iommu_table_base;
  }
  
  /* Frees table for an individual device node */
 @@ -135,17 +138,20 @@ static inline void pci_iommu_init(void) { }
  extern void alloc_dart_table(void);
  #if defined(CONFIG_PPC64) && defined(CONFIG_PM)
  static inline void iommu_save(void)
  {
   if (ppc_md.iommu_save)
   ppc_md.iommu_save();
  }
  
  static inline void iommu_restore(void)
  {
   if (ppc_md.iommu_restore)
   ppc_md.iommu_restore();
  }
  #endif
  
 +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry, 
 uint64_t tce,
 + enum dma_data_direction direction, unsigned long pages);
 +
  #endif /* __KERNEL__ */
  #endif /* _ASM_IOMMU_H */
 diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
 index ff5a6ce..94f614b 100644
 --- a/arch/powerpc/kernel/iommu.c
 +++ b/arch/powerpc/kernel/iommu.c
 @@ -32,30 +32,31 @@
   #include <linux/dma-mapping.h>
   #include <linux/bitmap.h>
   #include <linux/iommu-helper.h>
   #include <linux/crash_dump.h>
   #include <linux/hash.h>
   #include <linux/fault-inject.h>
   #include <linux/pci.h>
   #include <asm/io.h>
   #include <asm/prom.h>
   #include <asm/iommu.h>
   #include <asm/pci-bridge.h>
   #include <asm/machdep.h>
   #include <asm/kdump.h>
   #include <asm/fadump.h>
   #include <asm/vio.h>
  +#include <asm/tce.h>
  
  #define DBG(...)
  
  static int novmerge;
  
  static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int);
  
  static int __init setup_iommu(char *str)
  {
    if (!strcmp(str, "novmerge"))
    novmerge = 1;
    else if (!strcmp(str, "vmerge"))
    novmerge = 0;
   return 1;
  }
 @@ -844,15 +845,154 @@ void *iommu_alloc_coherent(struct device *dev, struct 
 iommu_table *tbl,
  }
  
  void iommu_free_coherent(struct iommu_table *tbl, size_t size,
void *vaddr, dma_addr_t dma_handle)
  {
   if (tbl) {
   unsigned int nio_pages;
  
   size = PAGE_ALIGN(size);
    nio_pages = size >> IOMMU_PAGE_SHIFT;
   iommu_free(tbl, dma_handle, nio_pages);
   size = 

Re: [PATCH 1/3] KVM: x86: clean up reexecute_instruction

2012-11-20 Thread Xiao Guangrong
On 11/20/2012 08:11 PM, Gleb Natapov wrote:
 On Tue, Nov 20, 2012 at 07:58:32AM +0800, Xiao Guangrong wrote:
 Little cleanup for reexecute_instruction, also use gpa_to_gfn in
 retry_instruction

 Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
 ---
  arch/x86/kvm/x86.c |   13 ++---
  1 files changed, 6 insertions(+), 7 deletions(-)

 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index 52ae8b5..7be8452 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -4477,19 +4477,18 @@ static bool reexecute_instruction(struct kvm_vcpu 
 *vcpu, gva_t gva)
  if (tdp_enabled)
  return false;

 +gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
 +if (gpa == UNMAPPED_GVA)
 +return true; /* let cpu generate fault */
 +
  /*
   * if emulation was due to access to shadowed page table
   * and it failed try to unshadow page and re-enter the
   * guest to let CPU execute the instruction.
   */
 -if (kvm_mmu_unprotect_page_virt(vcpu, gva))
  -if (kvm_mmu_unprotect_page(vcpu->kvm, c(gpa)))
 What's c()? Should be gpa_to_gfn(gpa)?

Yes. It is a stupid copy-paste error. Thank you for pointing it out, Gleb!
Here is the new version with it fixed.

Subject: [PATCH 1/3] KVM: x86: clean up reexecute_instruction

Little cleanup for reexecute_instruction, also use gpa_to_gfn in
retry_instruction

Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
---
 arch/x86/kvm/x86.c |   13 ++---
 1 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 52ae8b5..7be8452 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4477,19 +4477,18 @@ static bool reexecute_instruction(struct kvm_vcpu 
*vcpu, gva_t gva)
if (tdp_enabled)
return false;

+   gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
+   if (gpa == UNMAPPED_GVA)
+   return true; /* let cpu generate fault */
+
/*
 * if emulation was due to access to shadowed page table
 * and it failed try to unshadow page and re-enter the
 * guest to let CPU execute the instruction.
 */
-   if (kvm_mmu_unprotect_page_virt(vcpu, gva))
+   if (kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)))
return true;

-   gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
-
-   if (gpa == UNMAPPED_GVA)
-   return true; /* let cpu generate fault */
-
/*
 * Do not retry the unhandleable instruction if it faults on the
 * readonly host memory, otherwise it will goto a infinite loop:
@@ -4544,7 +4543,7 @@ static bool retry_instruction(struct x86_emulate_ctxt 
*ctxt,
	if (!vcpu->arch.mmu.direct_map)
		gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);

-	kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));

return true;
 }
-- 
1.7.7.6




Re: [RFC PATCH 13/16] kvm tools: keep track of registered memory banks in struct kvm

2012-11-20 Thread Sasha Levin
On 11/20/2012 12:15 PM, Will Deacon wrote:
 Hi Sasha,
 
 On Tue, Nov 13, 2012 at 04:37:38AM +, Sasha Levin wrote:
 On 11/12/2012 06:57 AM, Will Deacon wrote:
  struct kvm {
 struct kvm_arch arch;
 struct kvm_config   cfg;
 @@ -49,6 +56,7 @@ struct kvm {
 u64 ram_size;
 void*ram_start;
 u64 ram_pagesize;
 +   struct list_head    mem_banks;

 These memory banks actually look like a perfect example to use our augmented 
 interval rb-tree,
 can we switch them to use it, or is it a list on purpose?
 
 I found some time to look at this today but unfortunately they're not as
 ideally suited to the interval tree as they look: the problem being that we
 need to search for banks by both host virtual address *and* guest physical
 address depending on the translation that we're doing.
 
 We could have two separate tress, but that seems like overkill given the
 likely number of banks.

Makes sense. We can convert it later if we need to as well.


Thanks,
Sasha



Re: [PATCH v11] kvm: notify host when the guest is panicked

2012-11-20 Thread Marcelo Tosatti
On Tue, Nov 20, 2012 at 06:09:48PM +0800, Hu Tao wrote:
 Hi Marcelo,
 
 On Tue, Nov 13, 2012 at 12:19:08AM -0200, Marcelo Tosatti wrote:
  On Fri, Nov 09, 2012 at 03:17:39PM -0500, Sasha Levin wrote:
   On Mon, Nov 5, 2012 at 8:58 PM, Hu Tao hu...@cn.fujitsu.com wrote:
But in the case of panic notification, more dependency means more
chances of failure of panic notification. Say, if we use a virtio device
to do panic notification, then we will fail if: virtio itself has
problems, virtio for some reason can't be deployed(neither built-in or
as a module), or guest doesn't support virtio, etc.
   
   Add polling to your virtio device. If it didn't notify of a panic but
   taking more than 20 sec to answer your poll request you can assume
   it's dead.
   
   Actually, just use virtio-serial and something in userspace on the guest.
  
  They want the guest to stop, so a memory dump can be taken by management
  interface.
  
  Hu Tao, lets assume port I/O is the preferred method for communication.
 
 Okey.
 
  Now, the following comments have still not been addressed:
  
  1) Lifecycle of the stopped guest and interaction with other stopped
  states in QEMU.
 
 Patch 3 already deals with run state transitions. But in case I'm
 missing something, could you be more specific?

- What are the possibilities during migration? Say:
- migration starts.
- guest panics.
- migration starts vm on other side?
- Guest stopped due to EIO.
- guest vcpuN panics, VMEXIT but still outside QEMU.
- QEMU EIO error, stop vm.
- guest vcpuN completes, processes IO exit.
- system_reset due to panic.
- Add all possibilities that should be verified (that is, interaction 
of this feature with other stopped states in QEMU).

---

- What happens if the guest has reboot-on-panic configured? Does it take
precedence over hypervisor notification?



Out of curiosity, does kexec support memory dumping?

  2) Format of the interface for other architectures (you can choose
  a different KVM supported architecture and write an example).
  
  3) Clear/documented management interface for the feature.
 
 It is documented in patch 0: Documentation/virtual/kvm/pv_event.txt.
 Does it need to be improved?

This is documentation for the host-guest interface. There is no 
documentation on the interface for management.


Re: [PATCH v5] kvm/fpu: Enable fully eager restore kvm FPU

2012-11-20 Thread Marcelo Tosatti
On Wed, Nov 07, 2012 at 10:01:11AM +0800, Xudong Hao wrote:
 Remove fpu lazy restore logic, using eager restore totally.
 
 v5 changes from v4:
 - remove lazy fpu restore totally, fpu eager restore does not have performance
 regression and simple the code.
 
 v4 changes from v3:
 - Wrap up some confused code with a clear function lazy_fpu_allowed()
 - Update fpu while update cr4 too.
 
 v3 changes from v2:
 - Make fpu active explicitly while guest xsave is enabling and non-lazy xstate
 bit exist.
 
 v2 changes from v1:
 - Expand KVM_XSTATE_LAZY to 64 bits before negating it.
 
 Signed-off-by: Xudong Hao xudong@intel.com
 ---
  arch/x86/kvm/vmx.c   | 9 ++---
  arch/x86/kvm/x86.c   | 8 +---
  include/linux/kvm_host.h | 1 -
  3 files changed, 3 insertions(+), 15 deletions(-)
 
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index 6599e45..c1fd2e1 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -1197,7 +1197,7 @@ static void update_exception_bitmap(struct kvm_vcpu 
 *vcpu)
   u32 eb;
  
   eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
 -  (1u << NM_VECTOR) | (1u << DB_VECTOR);
 +  (1u << DB_VECTOR);
   if ((vcpu->guest_debug &
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
   (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))

Please remove the code entirely, including:

if (is_no_device(intr_info)) {
vmx_fpu_activate(vcpu);
return 1;
}

and clts handling.

fpu_active/fpu_deactivate callbacks become unused, don't they?
Also remove fpu_active variable.



Re: [PATCH 4/5] KVM: MMU: move adjusting softmmu pte access to FNAME(page_fault)

2012-11-20 Thread Marcelo Tosatti
On Mon, Nov 05, 2012 at 08:12:07PM +0800, Xiao Guangrong wrote:
 Then, no mmu specified code exists in the common function and drop two
 parameters in set_spte
 
 Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
  arch/x86/kvm/mmu.c |   42 +++---
  arch/x86/kvm/paging_tmpl.h |   25 -
  2 files changed, 31 insertions(+), 36 deletions(-)
 
 diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
 index 49957df..4229e78 100644
 --- a/arch/x86/kvm/mmu.c
 +++ b/arch/x86/kvm/mmu.c
 @@ -2351,8 +2351,7 @@ static bool gfn_need_write_protect(struct kvm_vcpu 
 *vcpu, u64 *sptep,
 
  /* The return value indicates whether the @gfn need to be write protected. */
  static bool vcpu_adjust_access(struct kvm_vcpu *vcpu, u64 *sptep,
 -unsigned *pte_access, int user_fault,
 -int write_fault, int level, gfn_t gfn,
 +unsigned *pte_access, int level, gfn_t gfn,
  bool can_unsync, bool host_writable)
  {
   bool ret = false;
 @@ -2361,21 +2360,6 @@ static bool vcpu_adjust_access(struct kvm_vcpu *vcpu, 
 u64 *sptep,
   if (!host_writable)
   access &= ~ACC_WRITE_MASK;
 
 - if (!(access & ACC_WRITE_MASK) && (!vcpu->arch.mmu.direct_map &&
 -   write_fault && !is_write_protection(vcpu) && !user_fault)) {
 - access |= ACC_WRITE_MASK;
 - access &= ~ACC_USER_MASK;
 -
 - /*
 -  * If we converted a user page to a kernel page,
 -  * so that the kernel can write to it when cr0.wp=0,
 -  * then we should prevent the kernel from executing it
 -  * if SMEP is enabled.
 -  */
 - if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
 - access &= ~ACC_EXEC_MASK;
 - }
 -
 
   if ((access & ACC_WRITE_MASK) &&
 gfn_need_write_protect(vcpu, sptep, level, gfn, can_unsync)) {
   access &= ~ACC_WRITE_MASK;
 @@ -2387,8 +2371,7 @@ static bool vcpu_adjust_access(struct kvm_vcpu *vcpu, 
 u64 *sptep,
  }
 
  static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 - unsigned pte_access, int user_fault,
 - int write_fault, int level,
 + unsigned pte_access, int level,
   gfn_t gfn, pfn_t pfn, bool speculative,
   bool can_unsync, bool host_writable)
  {
 @@ -2398,8 +2381,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
   if (set_mmio_spte(sptep, gfn, pfn, pte_access))
   return 0;
 
 - ret = vcpu_adjust_access(vcpu, sptep, pte_access, user_fault,
 -   write_fault, level, gfn, can_unsync, host_writable);
 + ret = vcpu_adjust_access(vcpu, sptep, pte_access, level, gfn,
 +  can_unsync, host_writable);
 
   spte = PT_PRESENT_MASK;
   if (!speculative)
 @@ -2440,17 +2423,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
  static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pt_access, unsigned pte_access,
 -  int user_fault, int write_fault,
 -  int *emulate, int level, gfn_t gfn,
 -  pfn_t pfn, bool speculative,
 +  int write_fault, int *emulate, int level,
 +  gfn_t gfn, pfn_t pfn, bool speculative,
bool host_writable)
  {
   bool was_rmapped = false;
 
 - pgprintk(%s: spte %llx access %x write_fault %d
 -   user_fault %d gfn %llx\n,
 -  __func__, *sptep, pt_access,
 -  write_fault, user_fault, gfn);
 + pgprintk(%s: spte %llx access %x write_fault %d gfn %llx\n,
 +  __func__, *sptep, pt_access, write_fault, gfn);
 
   if (is_rmap_spte(*sptep)) {
   if (pfn != spte_to_pfn(*sptep)) {
 @@ -2462,7 +2442,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 
 *sptep,
   was_rmapped = true;
   }
 
 - if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
 + if (set_spte(vcpu, sptep, pte_access,
 level, gfn, pfn, speculative, true,
 host_writable)) {
   if (write_fault)
 @@ -2556,7 +2536,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu 
 *vcpu,
 
   for (i = 0; i  ret; i++, gfn++, start++)
   mmu_set_spte(vcpu, start, ACC_ALL,
 -  access, 0, 0, NULL,
 +  access, 0, NULL,
sp-role.level, gfn,
page_to_pfn(pages[i]), true, true);
 
 @@ -2620,7 +2600,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, 
 int write,
   unsigned pte_access = ACC_ALL;
 
   mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
 -  

Re: [PATCH 2/5] KVM: MMU: simplify mmu_set_spte

2012-11-20 Thread Marcelo Tosatti
On Tue, Nov 13, 2012 at 04:39:44PM +0800, Xiao Guangrong wrote:
 On 11/13/2012 07:12 AM, Marcelo Tosatti wrote:
  On Mon, Nov 05, 2012 at 08:10:08PM +0800, Xiao Guangrong wrote:
  In order to detect spte remapping, we can simply check whether the
  spte already points to the pfn; this works even if the spte is not the
  last spte, because a middle spte points to a kernel pfn which can not
  be mapped to userspace
 
  Also, update slot and stat.lpages iff the spte is not remapped
 
  Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
  ---
   arch/x86/kvm/mmu.c |   40 +---
   1 files changed, 13 insertions(+), 27 deletions(-)
 
  diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
  index 692ebb1..4ea731e 100644
  --- a/arch/x86/kvm/mmu.c
  +++ b/arch/x86/kvm/mmu.c
  @@ -2420,8 +2420,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 
  *sptep,
  pfn_t pfn, bool speculative,
  bool host_writable)
   {
  -  int was_rmapped = 0;
  -  int rmap_count;
  +  bool was_rmapped = false;
 
 pgprintk(%s: spte %llx access %x write_fault %d
   user_fault %d gfn %llx\n,
  @@ -2429,25 +2428,13 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, 
  u64 *sptep,
  write_fault, user_fault, gfn);
 
 if (is_rmap_spte(*sptep)) {
  -  /*
  -   * If we overwrite a PTE page pointer with a 2MB PMD, unlink
  -   * the parent of the now unreachable PTE.
  -   */
  -  if (level > PT_PAGE_TABLE_LEVEL &&
  -  !is_large_pte(*sptep)) {
  -  struct kvm_mmu_page *child;
  -  u64 pte = *sptep;
  +  if (pfn != spte_to_pfn(*sptep)) {
  +  struct kvm_mmu_page *sp = page_header(__pa(sptep));
 
  -  child = page_header(pte & PT64_BASE_ADDR_MASK);
  -  drop_parent_pte(child, sptep);
  -  kvm_flush_remote_tlbs(vcpu->kvm);
  
  How come its safe to drop this case?
 
 We use if (pfn != spte_to_pfn(*sptep)) to simplify the thing.
 There are two cases:
 1) the sptep is not the last mapping.
under this case, sptep must point to a shadow page table, that means
    spte_to_pfn(*sptep) is used by the KVM module, and 'pfn' is used by userspace.
so, 'if' condition must be satisfied, the sptep will be dropped.
 
Actually, This is the origin case:
   | if (level  PT_PAGE_TABLE_LEVEL 
   |   !is_large_pte(*sptep))
 
 2) the sptep is the last mapping.
under this case, the level of spte (sp.level) must equal the 'level' which
we pass to mmu_set_spte. If they point to the same pfn, it is 'remap', 
 otherwise
we drop it.
 
 I think this is safe. :)
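
Restated as code, the simplified check covers both cases in one place
(illustrative only; per the reply below, mmu_page_zap_pte does the actual
unlinking in the patch):

	if (is_rmap_spte(*sptep) && pfn != spte_to_pfn(*sptep)) {
		struct kvm_mmu_page *sp = page_header(__pa(sptep));

		/* Case 1: *sptep points at a lower-level page table page
		 * (a KVM-internal pfn), which can never match a userspace
		 * pfn, so the old child gets unlinked.
		 * Case 2: *sptep is a last-level spte for a different pfn,
		 * so the stale mapping is dropped.
		 * Only a matching pfn is treated as a remap. */
		if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
			kvm_flush_remote_tlbs(vcpu->kvm);
	}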

mmu_page_zap_pte takes care of it, OK.

What if was_rmapped=true but gfn is different? Say if the spte comes
from an unsync shadow page, the guest modifies that shadow page (but
does not invalidate it with invlpg), then faults. gfn can still point
to the same gfn (but in that case, with your patch,
page_header_update_slot is not called.



Re: [PATCH 3/5] KVM: MMU: simplify set_spte

2012-11-20 Thread Marcelo Tosatti
On Mon, Nov 05, 2012 at 08:11:03PM +0800, Xiao Guangrong wrote:
 It is cleaner if we update pte_access first and then set the spte according
 to pte_access; also introduce gfn_need_write_protect to check whether the
 gfn needs to be write-protected
 
 Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com

Please separate patch in:
- code movement with no logical modification.
- logical modification (such as condition for mark_page_dirty).
- move code to helper functions.

  arch/x86/kvm/mmu.c |  109 
 
  1 files changed, 67 insertions(+), 42 deletions(-)
 
 diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
 index 4ea731e..49957df 100644
 --- a/arch/x86/kvm/mmu.c
 +++ b/arch/x86/kvm/mmu.c
 @@ -2329,6 +2329,63 @@ static int mmu_need_write_protect(struct kvm_vcpu 
 *vcpu, gfn_t gfn,
   return 0;
  }
 
 +static bool gfn_need_write_protect(struct kvm_vcpu *vcpu, u64 *sptep,
 +int level,  gfn_t gfn, bool can_unsync)
 +{
 + /*
 +  * Optimization: for pte sync, if spte was writable the hash
 +  * lookup is unnecessary (and expensive). Write protection
 +  * is responsibility of mmu_get_page / kvm_sync_page.
 +  * Same reasoning can be applied to dirty page accounting.
 +  */
 + if (!can_unsync && is_writable_pte(*sptep))
 + return false;
 +
 + if ((level > PT_PAGE_TABLE_LEVEL &&
 +has_wrprotected_page(vcpu->kvm, gfn, level)) ||
 +   mmu_need_write_protect(vcpu, gfn, can_unsync))
 + return true;
 +
 + return false;
 +}
 +
 +/* The return value indicates whether the @gfn need to be write protected. */
 +static bool vcpu_adjust_access(struct kvm_vcpu *vcpu, u64 *sptep,
 +unsigned *pte_access, int user_fault,
 +int write_fault, int level, gfn_t gfn,
 +bool can_unsync, bool host_writable)
 +{
 + bool ret = false;
 + unsigned access = *pte_access;
 +
 + if (!host_writable)
 + access &= ~ACC_WRITE_MASK;
 +
 + if (!(access & ACC_WRITE_MASK) && (!vcpu->arch.mmu.direct_map &&
 +   write_fault && !is_write_protection(vcpu) && !user_fault)) {
 + access |= ACC_WRITE_MASK;
 + access &= ~ACC_USER_MASK;
 +
 + /*
 +  * If we converted a user page to a kernel page,
 +  * so that the kernel can write to it when cr0.wp=0,
 +  * then we should prevent the kernel from executing it
 +  * if SMEP is enabled.
 +  */
 + if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
 + access &= ~ACC_EXEC_MASK;
 + }
 +
 + if ((access & ACC_WRITE_MASK) &&
 +   gfn_need_write_protect(vcpu, sptep, level, gfn, can_unsync)) {
 + access &= ~ACC_WRITE_MASK;
 + ret = true;
 + }
 +
 + *pte_access = access;
 + return ret;
 +}
 +
  static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
   unsigned pte_access, int user_fault,
   int write_fault, int level,
 @@ -2341,6 +2398,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
   if (set_mmio_spte(sptep, gfn, pfn, pte_access))
   return 0;
 
 + ret = vcpu_adjust_access(vcpu, sptep, pte_access, user_fault,
 +   write_fault, level, gfn, can_unsync, host_writable);
 +
   spte = PT_PRESENT_MASK;
   if (!speculative)
   spte |= shadow_accessed_mask;
 @@ -2353,61 +2413,26 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
   if (pte_access & ACC_USER_MASK)
   spte |= shadow_user_mask;
 
 + if (pte_access & ACC_WRITE_MASK) {
 + spte |= PT_WRITABLE_MASK;
 + spte |= SPTE_MMU_WRITEABLE;
 + }
 +
   if (level > PT_PAGE_TABLE_LEVEL)
   spte |= PT_PAGE_SIZE_MASK;
 +
   if (tdp_enabled)
   spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
   kvm_is_mmio_pfn(pfn));
 
   if (host_writable)
   spte |= SPTE_HOST_WRITEABLE;
 - else
 - pte_access &= ~ACC_WRITE_MASK;
 
   spte |= (u64)pfn << PAGE_SHIFT;
 
 - if ((pte_access & ACC_WRITE_MASK)
 - || (!vcpu->arch.mmu.direct_map && write_fault
 -  && !is_write_protection(vcpu) && !user_fault)) {
 - spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
 -
 - if (!vcpu->arch.mmu.direct_map
 -  && !(pte_access & ACC_WRITE_MASK)) {
 - spte &= ~PT_USER_MASK;
 - /*
 -  * If we converted a user page to a kernel page,
 -  * so that the kernel can write to it when cr0.wp=0,
 -  * then we should prevent the kernel from executing it
 -  * if SMEP is enabled.
 -  */
 - if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
 - 

Re: Re: Re: [RFC PATCH 0/2] kvm/vmx: Output TSC offset

2012-11-20 Thread Marcelo Tosatti
On Tue, Nov 20, 2012 at 07:36:33PM +0900, Yoshihiro YUNOMAE wrote:
 Hi Marcelo,
 
 Sorry for the late reply.
 
 (2012/11/17 4:15), Marcelo Tosatti wrote:
 On Wed, Nov 14, 2012 at 05:26:10PM +0900, Yoshihiro YUNOMAE wrote:
 Thank you for commenting on my patch set.
 
 (2012/11/14 11:31), Steven Rostedt wrote:
 On Tue, 2012-11-13 at 18:03 -0800, David Sharp wrote:
 On Tue, Nov 13, 2012 at 6:00 PM, Steven Rostedt rost...@goodmis.org 
 wrote:
 On Wed, 2012-11-14 at 10:36 +0900, Yoshihiro YUNOMAE wrote:
 
 To merge the data like previous pattern, we apply this patch set. Then, 
 we can
 get TSC offset of the guest as follows:
 
  $ dmesg | grep kvm
  [   57.717180] kvm: (2687) write TSC offset 18446743360465545001, now clock ##
  (here "(2687)" is the PID, the long number is the TSC offset, and "##" stands
  for the host TSC value)
 
 
 Using printk to export something like this is IMO a nasty hack.
 
 Can't we create a /sys or /proc file to export the same thing?
 
 Since the value changes over the course of the trace, and seems to be
 part of the context of the trace, I think I'd include it as a
 tracepoint.
 
 
 I'm fine with that too.
 
  Using a tracepoint is a nice idea, but there is one problem. Here,
  our point of discussion is that the event in which the TSC offset is changed
  does not occur frequently, but the buffer must keep the event data.
  
  There are two ideas for using a tracepoint. First, we define a new
  tracepoint for the changed TSC offset. This is simple and the overhead will
  be low. However, this trace event stored in the buffer will be
  overwritten by other trace events because the TSC offset event does
  not occur frequently. Second, we add TSC offset information to a
  tracepoint that occurs frequently. For example, we assume that the TSC offset
  information is added to the arguments of trace_kvm_exit().
 
 The TSC offset is in the host trace. So given a host trace with two TSC
 offset updates, how do you know which events in the guest trace
 (containing a number of events) refer to which tsc offset update?
 
 Unless i am missing something, you can't solve this easily (well, except
 exporting information to the guest that allows it to transform RDTSC -
 host TSC value, which can be done via pvclock).
 
 As you say, TSC offset events are in the host trace, but we don't need
 to notify guests of updating TSC offset. The offset event will output
 the next TSC offset value and the current TSC value, so we can
 calculate the guest TSC (T1) for the event. Guest TSCs since T1 can be
 converted to host TSC using the TSC offset, so we can integrate those
 trace data.

Think of this scenario:

host trace
1h. event tsc write tsc_offset=-1000
3h. vmenter
4h. vmexit
... (event sequence)
99h. vmexit
100h. event tsc_write tsc_offset=-2000
101h. vmenter
... (event sequence).
500h. event tsc_write tsc_offset=-3000

Then a guest trace containing events with a TSC timestamp.
Which tsc_offset to use? 

(that is the problem, which unless i am mistaken can only be solved
easily if the guest can convert RDTSC - TSC of host).
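
For illustration, here is a minimal sketch of the conversion being discussed,
assuming guest_tsc = host_tsc + tsc_offset (as with VMX TSC offsetting) and
that every "write TSC offset" event in the host trace records the host TSC at
the moment of the write.  The structure and function names below are
illustrative only, not taken from the patches:

/*
 * Illustrative only: map guest-TSC timestamps back to host TSC using
 * the recorded tsc_offset write events from the host trace.
 */
#include <stddef.h>
#include <stdint.h>

struct tsc_offset_event {
	uint64_t host_tsc;	/* host TSC when the new offset was written */
	int64_t  offset;	/* tsc_offset value that was written */
};

/* Guest TSC value at the instant this offset took effect. */
static uint64_t guest_tsc_at(const struct tsc_offset_event *e)
{
	return e->host_tsc + e->offset;
}

/*
 * Pick the last offset whose effective guest TSC is not after the
 * guest timestamp; events must be sorted by host_tsc.
 */
static uint64_t guest_to_host_tsc(uint64_t guest_tsc,
				  const struct tsc_offset_event *ev, size_t n)
{
	int64_t off = ev[0].offset;
	size_t i;

	for (i = 0; i < n; i++) {
		if (guest_tsc_at(&ev[i]) > guest_tsc)
			break;
		off = ev[i].offset;
	}
	return guest_tsc - off;
}

The ambiguity raised above shows up in the loop: if successive offsets produce
overlapping guest-TSC ranges, "last offset not after the timestamp" can pick
the wrong one, which is why letting the guest convert RDTSC to host TSC (e.g.
via pvclock) would make the merge robust.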

 Another issue as mentioned is lack of TSC synchronization in the host.
 Should you provide such a feature without the possibility of proper
 chronological order on systems with unsynchronized TSC?
 
 I think, we cannot support this sorting feature using TSC on systems
 with unsynchronized TSC. On systems with unsynchronized TSC, it is
 difficult to sort not only trace data of guests and the host but trace
 data of a guest or a host using TSC in chronological order. Actually,
 if we want to output tracing data of ftrace in chronological order with
 unsynchronized TSC, we will use the global mode as the timestamp. The
 global mode uses wallclock added TSC correction, so the mode guarantees
 to sort in chronological order for trace data of the guest or of
 the host. If we use this mode to sort the trace data of guests and the
 host in chronological order, we need to consider about the difference
 between the guest and the host and timekeeping of guests and the host,
  so it is difficult to solve these issues. At least, I haven't come up
  with a good solution.

I suppose the tradeoff is performance (RDTSC) versus reliability, when
using ftrace. But then, even ftrace on the host suffers from the
same problem, with unsynchronized TSCs.

 We cannot sort the trace data of guests and the host in chronological
 order with unsynchronized TSC, but if we can set following
 synchronization events for both guests and the host, we will know where
 we should sort.
 
 First, a guest and the host uses the global mode as the timestamp of
 ftrace. Next, a user on the guest writes 1 to the synchronization I/F
 as the ID, then the synchronization event 1 is recorded in a
 ring-buffer of the guest. The synchronization operation induces
 hypercall, so the host can handle the event. After the 

Re: [PATCH 2/5] KVM: MMU: simplify mmu_set_spte

2012-11-20 Thread Xiao Guangrong
On 11/21/2012 06:18 AM, Marcelo Tosatti wrote:

  -  child = page_header(pte & PT64_BASE_ADDR_MASK);
  -  drop_parent_pte(child, sptep);
  -  kvm_flush_remote_tlbs(vcpu->kvm);

 How come its safe to drop this case?

 We use if (pfn != spte_to_pfn(*sptep)) to simplify the thing.
 There are two cases:
 1) the sptep is not the last mapping.
under this case, sptep must point to a shadow page table, that means
spte_to_pfn(*sptep)) is used by KVM module, and 'pfn' is used by 
 userspace.
so, 'if' condition must be satisfied, the sptep will be dropped.

Actually, This is the origin case:
   | if (level > PT_PAGE_TABLE_LEVEL &&
   |   !is_large_pte(*sptep))

 2) the sptep is the last mapping.
under this case, the level of spte (sp.level) must equal the 'level' which
we pass to mmu_set_spte. If they point to the same pfn, it is 'remap', 
 otherwise
we drop it.

 I think this is safe. :)
 
 mmu_page_zap_pte takes care of it, OK.
 
 What if was_rmapped=true but gfn is different? Say if the spte comes
 from an unsync shadow page, the guest modifies that shadow page (but
 does not invalidate it with invlpg), then faults. gfn can still point
 to the same gfn (but in that case, with your patch,
 page_header_update_slot is not called.

Marcelo,

Page fault path and other sync/prefetch paths will reread guest page table,
then it get a different target pfn.

The scenario is like this:

gfn1 = pfn1, gfn2 = pfn2
gpte = pfn1, spte is shadowed by gpte and it is a unsync spte

Guest   Host
 spte = (gfn1, pfn1)

modify gpte to let it point to gfn2
spte = (gfn1, pfn1)
page-fault on gpte
intercept the page-fault, then
want to update spte to (gfn2, pfn2)

in mmu_set_spte, we can detect
pfn2 != pfn1, then drop it.

Hmm, the interesting thing is what if different gfns map to the same pfn.
For example, spte1 is shadowed by gfn1 and spte2 is shadowed by gfn2, and both
gfn1 and gfn2 map to the same pfn; the code (including the current code) will set
spte1 to the gfn2's rmap and spte2 to the gfn1's rmap. But i think it is ok.






Re: [PATCH 3/5] KVM: MMU: simplify set_spte

2012-11-20 Thread Xiao Guangrong
On 11/21/2012 06:24 AM, Marcelo Tosatti wrote:
 On Mon, Nov 05, 2012 at 08:11:03PM +0800, Xiao Guangrong wrote:
  It is cleaner if we update pte_access first and then set the spte according
  to pte_access; also introduce gfn_need_write_protect to check whether the
  gfn needs to be write-protected

 Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
 
 Please separate patch in:
 - code movement with no logical modification.
 - logical modification (such as condition for mark_page_dirty).
 - move code to helper functions.

Okay, will split it.



Re: [PATCH 4/5] KVM: MMU: move adjusting softmmu pte access to FNAME(page_fault)

2012-11-20 Thread Xiao Guangrong
On 11/21/2012 06:27 AM, Marcelo Tosatti wrote:

 @@ -544,6 +544,21 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, 
 gva_t addr, u32 error_code,
  return 0;
  }

  +if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) &&
  +  !is_write_protection(vcpu) && !user_fault) {
  +walker.pte_access |= ACC_WRITE_MASK;
  +walker.pte_access &= ~ACC_USER_MASK;
  +
  +/*
  + * If we converted a user page to a kernel page,
  + * so that the kernel can write to it when cr0.wp=0,
  + * then we should prevent the kernel from executing it
  + * if SMEP is enabled.
  + */
  +if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
  +walker.pte_access &= ~ACC_EXEC_MASK;
 +}
 +
 
 What about sync_page path?

The sync_page and other prefetch paths only do read-prefetch, means
they call set_spte with write_fault = 0.



Re: [PATCH 2/5] KVM: MMU: simplify mmu_set_spte

2012-11-20 Thread Marcelo Tosatti
On Wed, Nov 21, 2012 at 07:23:26AM +0800, Xiao Guangrong wrote:
 On 11/21/2012 06:18 AM, Marcelo Tosatti wrote:
 
   -child = page_header(pte & PT64_BASE_ADDR_MASK);
   -drop_parent_pte(child, sptep);
   -kvm_flush_remote_tlbs(vcpu->kvm);
 
  How come its safe to drop this case?
 
  We use if (pfn != spte_to_pfn(*sptep)) to simplify the thing.
  There are two cases:
  1) the sptep is not the last mapping.
 under this case, sptep must point to a shadow page table, that means
 spte_to_pfn(*sptep)) is used by KVM module, and 'pfn' is used by 
  userspace.
 so, 'if' condition must be satisfied, the sptep will be dropped.
 
 Actually, This is the origin case:
 | if (level > PT_PAGE_TABLE_LEVEL &&
 |   !is_large_pte(*sptep))
 
  2) the sptep is the last mapping.
 under this case, the level of spte (sp.level) must equal the 'level' 
  which
 we pass to mmu_set_spte. If they point to the same pfn, it is 'remap', 
  otherwise
 we drop it.
 
  I think this is safe. :)
  
  mmu_page_zap_pte takes care of it, OK.
  
  What if was_rmapped=true but gfn is different? Say if the spte comes
  from an unsync shadow page, the guest modifies that shadow page (but
  does not invalidate it with invlpg), then faults. gfn can still point
  to the same gfn (but in that case, with your patch,
  page_header_update_slot is not called.
 
 Marcelo,
 
 Page fault path and other sync/prefetch paths will reread guest page table,
 then it get a different target pfn.
 
 The scenario is like this:
 
 gfn1 = pfn1, gfn2 = pfn2
 gpte = pfn1, spte is shadowed by gpte and it is a unsync spte
 
 Guest   Host
  spte = (gfn1, pfn1)
 
 modify gpte to let it point to gfn2
 spte = (gfn1, pfn1)
 page-fault on gpte
 intercept the page-fault, then
 want to update spte to (gfn2, pfn2)
 
 in mmu_set_spte, we can detect
 pfn2 != pfn1, then drop it.
 
 Hmm, the interesting thing is what if different gfns map to the same pfn.
 For example, spte1 is shadowed by gfn1 and spte2 is shadowed by pfn2, both
 gfn1 and gfn2 map to pfn, the code (including the current code) will set
 spte1 to the gfn2's rmap and spte2 to the gfn1's rmap. But i think it is ok.

Current code updates gfn properly in set_spte by
page_header_update_slot. 

Better keep state properly.



Interrupt controller updates

2012-11-20 Thread Benjamin Herrenschmidt
Hi Jan !

David (CC) want to make some progress with our in-kernel PIC. From
memory, one of the outcomes of the BOF was that we need to move the
existing enable in-kernel PIC from generic KVM init to machine init in
order to be able to add an argument indicating the model use by the
arch/platform since some like ours support several different models and
since that all needs to be selected before the VCPUs are created.

Again, from memory, you were volunteered to do the initial x86 change so
we could piggy back on it :-) Or do I remember wrong ?

Cheers,
Ben.




RE: Guest performance is reduced after live migration

2012-11-20 Thread Shouta.Uehara
Dear all

I am still watching the mailing list to see whether a similar problem is reported,
because the problem does not seem to happen to others.
Any information, however small, would be appreciated.

regards

 -Original Message-
 From: Uehara, Shouta (shouta.ueh...@jp.yokogawa.com)
 Sent: Friday, November 09, 2012 6:52 PM
 To: 'kvm@vger.kernel.org'; 'Xiao Guangrong
 (xiaoguangr...@linux.vnet.ibm.com)'
 Subject: RE: Guest performance is reduced after live migration

 I've analysed the problem with migration using perf events, and it is
 confirmed that the cost in glibc is significantly increased.
 I made a simple test program that executes the read system call on the guest,
 as follows, and examined the performance from the host and the guest.

 fd = open("/dev/zero", O_RDONLY);
 while (1) { read(fd, &ch, 1); }

 ---
 [Source host]

 $ sudo perf kvm --host --guest record -a -o src_host.perf sleep 10

 Events: 10K cycles
  89.36%  qemu-system-x86  [unknown]  [g]
 0x810a36ef
   9.04%  qemu-system-x86  [unknown]  [u]
 0x3fd20e2020
   0.16%  swapper  [kernel.kallsyms]  [k] intel_idle
   0.11%sleep  [kernel.kallsyms]  [k] page_fault

 [Guest on source host]

 $ perf record -o src_guest.perf ./loop_read

 Events: 29K cpu-clock
 11.71%  loop_read  [kernel.kallsyms]  [k] system_call_after_swapgs
  9.58%  loop_read  libc-2.14.90.so[.] __GI___libc_read
  6.92%  loop_read  [kernel.kallsyms]  [k] vfs_read
  5.53%  loop_read  [kernel.kallsyms]  [k] fsnotify

 __GI___libc_read
  :003fd20e2010 __read:
 2.31 :  3fd20e2010:   83 3d 1d 22 2d 00 00cmpl
 $0x0,0x2d221d(
 5.67 :  3fd20e2017:   75 10   jne
 3fd20e2029 __
  :
  :003fd20e2019 __read_nocancel:
 1.82 :  3fd20e2019:   b8 00 00 00 00  mov
 $0x0,%eax
 0.00 :  3fd20e201e:   0f 05   syscall
87.78 :  3fd20e2020:   48 3d 01 f0 ff ff   cmp
 $0xfff
 0.00 :  3fd20e2026:   73 31   jae
 3fd20e2059 __
 2.42 :  3fd20e2028:   c3  retq

 [Destination host]

 $ sudo perf kvm --host --guest record -a -o dst_host.perf sleep 10

 Events: 10K cycles
  58.39%  qemu-system-x86  [unknown]  [g]
 0x810a3a6e
  40.14%  qemu-system-x86  [unknown]  [u]
 0x3fd20e2017
   0.13%  gnome-shell  nouveau_dri.so [.] 0xbd7c9
   0.11%  swapper  [kernel.kallsyms]  [k] intel_idle

 [Guest on destination host]

 $ perf record -o dst_guest.perf ./loop_read

 Events: 29K cpu-clock
 41.95%  loop_read  libc-2.14.90.so[.] __GI___libc_read
  7.90%  loop_read  [kernel.kallsyms]  [k] system_call_after_swapgs
  4.61%  loop_read  [kernel.kallsyms]  [k] vfs_read
  3.72%  loop_read  [kernel.kallsyms]  [k] fsnotify

 __GI___libc_read
  :003fd20e2010 __read:
 0.41 :  3fd20e2010:   83 3d 1d 22 2d 00 00cmpl
 $0x0,0x2d221d(
86.10 :  3fd20e2017:   75 10   jne
 3fd20e2029 __
  :
  :003fd20e2019 __read_nocancel:
 0.33 :  3fd20e2019:   b8 00 00 00 00  mov
 $0x0,%eax
 0.00 :  3fd20e201e:   0f 05   syscall
12.84 :  3fd20e2020:   48 3d 01 f0 ff ff   cmp
 $0xfff
 0.00 :  3fd20e2026:   73 31   jae
 3fd20e2059 __
 0.33 :  3fd20e2028:   c3  retq

 ---

 After the migration, the jne instruction in __read takes much longer.
 This is due to the increased overhead of the read system call. The performance
 was not improved even when THP/hugetlbfs was disabled.
 Does this problem happen only to me? Should I also ask other communities about
 this problem?

Shota

  -Original Message-
  From: Xiao Guangrong [mailto:xiaoguangr...@linux.vnet.ibm.com]
  Sent: Thursday, November 01, 2012 1:45 PM
  To: Uehara, Shouta (shouta.ueh...@jp.yokogawa.com)
  Cc: kvm@vger.kernel.org
  Subject: Re: Guest performance is reduced after live migration
 
  Shouta,
 
  Can it be reproduced if thp/hugetlbfs is disabled on both source and
  destination?
 
  On 11/01/2012 08:12 AM, shouta.ueh...@jp.yokogawa.com wrote:
   Hello.
  
   I have a problem with the performance of the guest Linux after live
  migration.
   When I analyze the file I/O latency of the guest using LMbench3, the
   latency of the guest on the destination host is about 2 times bigger
    than on the guest on the source host. From my investigation of it,
    this problem occurs when the three following conditions are met.
  
 1. Use the kernel version 2.6.38.6-26.fc15.x86_64 or later.
 2. Execute system calls for low level file I/O (read, write, open
   etc.)
  on the
guest.
 3. Enable EPT.
  
   Performance cannot decrease on other tests of 

[PATCH v5 0/2] x86: vmclear vmcss on all cpus when doing kdump if necessary

2012-11-20 Thread Zhang Yanfei
Currently, kdump just makes all the logical processors leave VMX operation by
executing the VMXOFF instruction, so any VMCSs active on the logical processors
may be corrupted. But sometimes we need those VMCSs to debug guest images
contained in the host vmcore. To prevent the corruption, we should VMCLEAR the
VMCSs before executing the VMXOFF instruction.

The patch set provides a way to VMCLEAR the VMCSs related to guests on all cpus
before executing VMXOFF when doing kdump. This is used to ensure that the VMCSs
in the vmcore are up to date and not corrupted.

Changelog from v4 to v5:
1. use an atomic notifier instead of function call, so
   have all the vmclear codes in vmx.c.

Changelog from v3 to v4:
1. add a new percpu variable vmclear_skipped to skip
   vmclear in kdump in some conditions.

Changelog from v2 to v3:
1. remove unnecessary conditions in function
   cpu_emergency_clear_loaded_vmcss as Marcelo suggested.

Changelog from v1 to v2:
1. remove the sysctl and clear VMCSs unconditionally.

Zhang Yanfei (2):
  x86/kexec: VMCLEAR vmcss on all cpus if necessary
  KVM-INTEL: add a notifier and a bitmap to support VMCLEAR in kdump

 arch/x86/include/asm/kexec.h |2 +
 arch/x86/kernel/crash.c  |   25 
 arch/x86/kvm/vmx.c   |   85 ++
 3 files changed, 112 insertions(+), 0 deletions(-)


[PATCH v5 1/2] x86/kexec: VMCLEAR vmcss on all cpus if necessary

2012-11-20 Thread Zhang Yanfei
This patch adds an atomic notifier list named crash_notifier_list.
When loading kvm-intel module, a notifier will be registered in
the list to enable vmcss loaded on all cpus to be VMCLEAR'd if
needed.

Signed-off-by: Zhang Yanfei zhangyan...@cn.fujitsu.com
---
 arch/x86/include/asm/kexec.h |2 ++
 arch/x86/kernel/crash.c  |   25 +
 2 files changed, 27 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 317ff17..5e22b00 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -163,6 +163,8 @@ struct kimage_arch {
 };
 #endif
 
+extern struct atomic_notifier_head crash_notifier_list;
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_X86_KEXEC_H */
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 13ad899..0f3d5b4 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -16,6 +16,8 @@
 #include <linux/delay.h>
 #include <linux/elf.h>
 #include <linux/elfcore.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
 
 #include <asm/processor.h>
 #include <asm/hardirq.h>
@@ -30,6 +32,19 @@
 
 int in_crash_kexec;
 
+/*
+ * The list is used to VMCLEAR vmcss loaded on all
+ * cpus. And when loading kvm_intel module, the
+ * vmclear function will be registered in the list.
+ */
+ATOMIC_NOTIFIER_HEAD(crash_notifier_list);
+EXPORT_SYMBOL_GPL(crash_notifier_list);
+
+static inline void cpu_emergency_vmclear_loaded_vmcss(void)
+{
+   atomic_notifier_call_chain(&crash_notifier_list, 0, NULL);
+}
+
 #if defined(CONFIG_SMP)  defined(CONFIG_X86_LOCAL_APIC)
 
 static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
@@ -46,6 +61,11 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
 #endif
crash_save_cpu(regs, cpu);
 
+   /*
+* VMCLEAR vmcss loaded on all cpus if needed.
+*/
+   cpu_emergency_vmclear_loaded_vmcss();
+
/* Disable VMX or SVM if needed.
 *
 * We need to disable virtualization on all CPUs.
@@ -88,6 +108,11 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
 
kdump_nmi_shootdown_cpus();
 
+   /*
+* VMCLEAR vmcss loaded on this cpu if needed.
+*/
+   cpu_emergency_vmclear_loaded_vmcss();
+
/* Booting kdump kernel with VMX or SVM enabled won't work,
 * because (among other limitations) we can't disable paging
 * with the virt flags.
-- 
1.7.1



[PATCH v5 2/2] KVM-INTEL: add a notifier and a bitmap to support VMCLEAR in kdump

2012-11-20 Thread Zhang Yanfei
The notifier will be registered in crash_notifier_list when loading
kvm-intel module. And the bitmap indicates whether we should do
VMCLEAR operation in kdump. The bits in the bitmap are set/unset
according to different conditions.

Signed-off-by: Zhang Yanfei zhangyan...@cn.fujitsu.com
---
 arch/x86/kvm/vmx.c |   85 
 1 files changed, 85 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4ff0ab9..3bbdd75 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -41,6 +41,7 @@
 #include <asm/i387.h>
 #include <asm/xcr.h>
 #include <asm/perf_event.h>
+#include <asm/kexec.h>
 
 #include "trace.h"
 
@@ -963,6 +964,30 @@ static void vmcs_load(struct vmcs *vmcs)
   vmcs, phys_addr);
 }
 
+#ifdef CONFIG_KEXEC
+/*
+ * This bitmap is used to indicate whether the vmclear
+ * operation is enabled on all cpus. All disabled by
+ * default.
+ */
+static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
+
+static inline void crash_enable_local_vmclear(int cpu)
+{
 +   cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
 +}
 +
 +static inline void crash_disable_local_vmclear(int cpu)
 +{
 +   cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
 +}
 +
 +static inline int crash_local_vmclear_enabled(int cpu)
 +{
 +   return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
+}
+#endif
+
 static void __loaded_vmcs_clear(void *arg)
 {
struct loaded_vmcs *loaded_vmcs = arg;
@@ -972,8 +997,14 @@ static void __loaded_vmcs_clear(void *arg)
return; /* vcpu migration can race with cpu offline */
	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
per_cpu(current_vmcs, cpu) = NULL;
+#ifdef CONFIG_KEXEC
+   crash_disable_local_vmclear(cpu);
+#endif
	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
loaded_vmcs_init(loaded_vmcs);
+#ifdef CONFIG_KEXEC
+   crash_enable_local_vmclear(cpu);
+#endif
 }
 
 static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
@@ -1491,8 +1522,14 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
local_irq_disable();
+#ifdef CONFIG_KEXEC
+   crash_disable_local_vmclear(cpu);
+#endif
	list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
		 &per_cpu(loaded_vmcss_on_cpu, cpu));
+#ifdef CONFIG_KEXEC
+   crash_enable_local_vmclear(cpu);
+#endif
local_irq_enable();
 
/*
@@ -2302,6 +2339,20 @@ static int hardware_enable(void *garbage)
return -EBUSY;
 
	INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+
+#ifdef CONFIG_KEXEC
+   /*
+* Now we can enable the vmclear operation in kdump
+* since the loaded_vmcss_on_cpu list on this cpu
+* has been initialized.
+*
+* Though the cpu is not in VMX operation now, there
+* is no problem to enable the vmclear operation
+* for the loaded_vmcss_on_cpu list is empty!
+*/
+   crash_enable_local_vmclear(cpu);
+#endif
+
rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
 
test_bits = FEATURE_CONTROL_LOCKED;
@@ -2335,6 +2386,22 @@ static void vmclear_local_loaded_vmcss(void)
__loaded_vmcs_clear(v);
 }
 
+#ifdef CONFIG_KEXEC
+static int crash_vmclear_local_loaded_vmcss(struct notifier_block *this,
+   unsigned long val, void *ptr)
+{
+   int cpu = raw_smp_processor_id();
+
+   if (crash_local_vmclear_enabled(cpu))
+   vmclear_local_loaded_vmcss();
+
+   return NOTIFY_DONE;
+}
+
+static struct notifier_block crash_vmclear_notifier = {
+   .notifier_call = crash_vmclear_local_loaded_vmcss,
+};
+#endif
 
 /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
  * tricks.
@@ -2348,6 +2415,14 @@ static void hardware_disable(void *garbage)
 {
if (vmm_exclusive) {
vmclear_local_loaded_vmcss();
+#ifdef CONFIG_KEXEC
+   /*
+* vmclear operation in kdump should be disabled here
+* because the cpu is going to exit VMX operation
+* and the loaded_vmcss_on_cpu list may not be empty!
+*/
+   crash_disable_local_vmclear(raw_smp_processor_id());
+#endif
kvm_cpu_vmxoff();
}
	write_cr4(read_cr4() & ~X86_CR4_VMXE);
@@ -7230,6 +7305,11 @@ static int __init vmx_init(void)
if (r)
goto out3;
 
+#ifdef CONFIG_KEXEC
+   atomic_notifier_chain_register(&crash_notifier_list,
+  crash_vmclear_notifier);
+#endif
+
vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
@@ -7265,6 +7345,11 @@ static void __exit vmx_exit(void)

Re: [PATCH 2/5] KVM: MMU: simplify mmu_set_spte

2012-11-20 Thread Xiao Guangrong
On 11/21/2012 07:51 AM, Marcelo Tosatti wrote:
 On Wed, Nov 21, 2012 at 07:23:26AM +0800, Xiao Guangrong wrote:
 On 11/21/2012 06:18 AM, Marcelo Tosatti wrote:

  -child = page_header(pte & PT64_BASE_ADDR_MASK);
  -drop_parent_pte(child, sptep);
  -kvm_flush_remote_tlbs(vcpu->kvm);

 How come its safe to drop this case?

 We use if (pfn != spte_to_pfn(*sptep)) to simplify the thing.
 There are two cases:
 1) the sptep is not the last mapping.
under this case, sptep must point to a shadow page table, that means
spte_to_pfn(*sptep)) is used by KVM module, and 'pfn' is used by 
 userspace.
so, 'if' condition must be satisfied, the sptep will be dropped.

Actually, This is the origin case:
    | if (level > PT_PAGE_TABLE_LEVEL &&
    |   !is_large_pte(*sptep))

 2) the sptep is the last mapping.
under this case, the level of spte (sp.level) must equal the 'level' 
 which
we pass to mmu_set_spte. If they point to the same pfn, it is 'remap', 
 otherwise
we drop it.

 I think this is safe. :)

 mmu_page_zap_pte takes care of it, OK.

 What if was_rmapped=true but gfn is different? Say if the spte comes
 from an unsync shadow page, the guest modifies that shadow page (but
 does not invalidate it with invlpg), then faults. gfn can still point
 to the same gfn (but in that case, with your patch,
 page_header_update_slot is not called.

 Marcelo,

 Page fault path and other sync/prefetch paths will reread guest page table,
 then it get a different target pfn.

 The scenario is like this:

 gfn1 = pfn1, gfn2 = pfn2
 gpte = pfn1, spte is shadowed by gpte and it is a unsync spte

 Guest   Host
  spte = (gfn1, pfn1)

 modify gpte to let it point to gfn2
 spte = (gfn1, pfn1)
 page-fault on gpte
 intercept the page-fault, then
 want to update spte to (gfn2, pfn2)

 in mmu_set_spte, we can detect
 pfn2 != pfn1, then drop it.

 Hmm, the interesting thing is what if different gfns map to the same pfn.
 For example, spte1 is shadowed by gfn1 and spte2 is shadowed by pfn2, both
 gfn1 and gfn2 map to pfn, the code (including the current code) will set
 spte1 to the gfn2's rmap and spte2 to the gfn1's rmap. But i think it is ok.
 
 Current code updates gfn properly in set_spte by
 page_header_update_slot. 
 
 Better keep state properly.

Okay, i will not change the position of page_header_update_slot in the
next version. Thank you, Marcelo!




RE: [PATCH v5] kvm/fpu: Enable fully eager restore kvm FPU

2012-11-20 Thread Hao, Xudong
 -Original Message-
 From: Marcelo Tosatti [mailto:mtosa...@redhat.com]
 Sent: Wednesday, November 21, 2012 6:00 AM
 To: Hao, Xudong
 Cc: a...@redhat.com; kvm@vger.kernel.org
 Subject: Re: [PATCH v5] kvm/fpu: Enable fully eager restore kvm FPU
 
 On Wed, Nov 07, 2012 at 10:01:11AM +0800, Xudong Hao wrote:
   Remove the fpu lazy restore logic, using eager restore totally.
  
   v5 changes from v4:
   - remove lazy fpu restore totally; fpu eager restore does not have a
   performance regression and simplifies the code.
 
  v4 changes from v3:
  - Wrap up some confused code with a clear function lazy_fpu_allowed()
  - Update fpu while update cr4 too.
 
  v3 changes from v2:
  - Make fpu active explicitly while guest xsave is enabling and non-lazy 
  xstate
  bit exist.
 
  v2 changes from v1:
  - Expand KVM_XSTATE_LAZY to 64 bits before negating it.
 
  Signed-off-by: Xudong Hao xudong@intel.com
  ---
   arch/x86/kvm/vmx.c   | 9 ++---
   arch/x86/kvm/x86.c   | 8 +---
   include/linux/kvm_host.h | 1 -
   3 files changed, 3 insertions(+), 15 deletions(-)
 
  diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
  index 6599e45..c1fd2e1 100644
  --- a/arch/x86/kvm/vmx.c
  +++ b/arch/x86/kvm/vmx.c
  @@ -1197,7 +1197,7 @@ static void update_exception_bitmap(struct
 kvm_vcpu *vcpu)
  u32 eb;
 
   eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
   -(1u << NM_VECTOR) | (1u << DB_VECTOR);
   +(1u << DB_VECTOR);
   if ((vcpu->guest_debug &
   (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
  (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
 
 Please remove the code entirely, including:
 
 if (is_no_device(intr_info)) {
 vmx_fpu_activate(vcpu);
 return 1;
 }
 
 and clts handling.
 
 fpu_active/fpu_deactivate callbacks become unused, don't they?
 Also remove fpu_active variable.

Okay, will remove all of these.


Re: Guest performance is reduced after live migration

2012-11-20 Thread Xiao Guangrong
On 11/21/2012 09:25 AM, shouta.ueh...@jp.yokogawa.com wrote:
 Dear all,
 
 I am still watching the mailing list to see whether a similar problem is reported,
 because the problem does not seem to happen to others.
 Any information, however small, would be appreciated.
 

I am digging into it, but did not get useful hint so far...




Re: [PATCH] vhost-blk: Add vhost-blk support v5

2012-11-20 Thread Asias He
On 11/20/2012 09:37 PM, Michael S. Tsirkin wrote:
 On Tue, Nov 20, 2012 at 02:39:40PM +0800, Asias He wrote:
 On 11/20/2012 04:26 AM, Michael S. Tsirkin wrote:
 On Mon, Nov 19, 2012 at 04:53:42PM +0800, Asias He wrote:
 vhost-blk is an in-kernel virito-blk device accelerator.

 Due to lack of proper in-kernel AIO interface, this version converts
 guest's I/O request to bio and use submit_bio() to submit I/O directly.
  So this version only supports raw block devices as the guest's disk image,
  e.g. /dev/sda, /dev/ram0. We can add file-based image support to
  vhost-blk once we have an in-kernel AIO interface. There is some work in
  progress on an in-kernel AIO interface from Dave Kleikamp and Zach Brown:

http://marc.info/?l=linux-fsdevelm=133312234313122

 Performance evaluation:
 -
 1) LKVM
 Fio with libaio ioengine on Fusion IO device using kvm tool
 IOPS(k)Before   After   Improvement
 seq-read   107  121 +13.0%
 seq-write  130  179 +37.6%
 rnd-read   102  122 +19.6%
 rnd-write  125  159 +27.0%

 2) QEMU
 Fio with libaio ioengine on Fusion IO device using QEMU
 IOPS(k)Before   After   Improvement
 seq-read   76   123 +61.8%
 seq-write  139  173 +24.4%
 rnd-read   73   120 +64.3%
 rnd-write  75   156 +108.0%

 Could you compare with dataplane qemu as well please?


 Well, I will try to collect it.



 Userspace bits:
 -
 1) LKVM
 The latest vhost-blk userspace bits for kvm tool can be found here:
 g...@github.com:asias/linux-kvm.git blk.vhost-blk

 2) QEMU
 The latest vhost-blk userspace prototype for QEMU can be found here:
 g...@github.com:asias/qemu.git blk.vhost-blk

 Changes in v5:
 - Do not assume the buffer layout
 - Fix wakeup race

 Changes in v4:
 - Mark req-status as userspace pointer
 - Use __copy_to_user() instead of copy_to_user() in vhost_blk_set_status()
 - Add if (need_resched()) schedule() in blk thread
 - Kill vhost_blk_stop_vq() and move it into vhost_blk_stop()
 - Use vq_err() instead of pr_warn()
  - Fail on unsupported requests
 - Add flush in vhost_blk_set_features()

 Changes in v3:
 - Sending REQ_FLUSH bio instead of vfs_fsync, thanks Christoph!
 - Check file passed by user is a raw block device file

 Signed-off-by: Asias He as...@redhat.com

 Since there are files shared by this and vhost net
 it's easiest for me to merge this all through the
 vhost tree.

 Jens, could you ack this and the bio usage in this driver
 please?

 ---
  drivers/vhost/Kconfig |   1 +
  drivers/vhost/Kconfig.blk |  10 +
  drivers/vhost/Makefile|   2 +
  drivers/vhost/blk.c   | 697 
 ++
  drivers/vhost/blk.h   |   8 +
  5 files changed, 718 insertions(+)
  create mode 100644 drivers/vhost/Kconfig.blk
  create mode 100644 drivers/vhost/blk.c
  create mode 100644 drivers/vhost/blk.h

 diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
 index 202bba6..acd8038 100644
 --- a/drivers/vhost/Kconfig
 +++ b/drivers/vhost/Kconfig
 @@ -11,4 +11,5 @@ config VHOST_NET
  
  if STAGING
  source drivers/vhost/Kconfig.tcm
 +source drivers/vhost/Kconfig.blk
  endif
 diff --git a/drivers/vhost/Kconfig.blk b/drivers/vhost/Kconfig.blk
 new file mode 100644
 index 000..ff8ab76
 --- /dev/null
 +++ b/drivers/vhost/Kconfig.blk
 @@ -0,0 +1,10 @@
 +config VHOST_BLK
 +  tristate Host kernel accelerator for virtio blk (EXPERIMENTAL)
  depends on BLOCK && EXPERIMENTAL && m
 +  ---help---
 +This kernel module can be loaded in host kernel to accelerate
 +guest block with virtio_blk. Not to be confused with virtio_blk
 +module itself which needs to be loaded in guest kernel.
 +
 +To compile this driver as a module, choose M here: the module will
 +be called vhost_blk.
 diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
 index a27b053..1a8a4a5 100644
 --- a/drivers/vhost/Makefile
 +++ b/drivers/vhost/Makefile
 @@ -2,3 +2,5 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o
  vhost_net-y := vhost.o net.o
  
  obj-$(CONFIG_TCM_VHOST) += tcm_vhost.o
 +obj-$(CONFIG_VHOST_BLK) += vhost_blk.o
 +vhost_blk-y := blk.o
 diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c
 new file mode 100644
 index 000..f0f118a
 --- /dev/null
 +++ b/drivers/vhost/blk.c
 @@ -0,0 +1,697 @@
 +/*
 + * Copyright (C) 2011 Taobao, Inc.
 + * Author: Liu Yuan tailai...@taobao.com
 + *
 + * Copyright (C) 2012 Red Hat, Inc.
 + * Author: Asias He as...@redhat.com
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2.
 + *
 + * virtio-blk server in host kernel.
 + */
 +
  +#include <linux/miscdevice.h>
  +#include <linux/module.h>
  +#include <linux/vhost.h>
  +#include <linux/virtio_blk.h>
  +#include <linux/mutex.h>
  +#include <linux/file.h>
  +#include <linux/kthread.h>
  +#include <linux/blkdev.h>
  +#include <linux/llist.h>
  +
  +#include "vhost.c"
  +#include "vhost.h"
  +#include "blk.h"
 +
 +static 

Re: messed up with xml-files and configuration of a VM

2012-11-20 Thread Stefan Hajnoczi
On Tue, Nov 20, 2012 at 5:13 PM, Lentes, Bernd
bernd.len...@helmholtz-muenchen.de wrote:
 first, i'm new to kvm. I'm running KVM on a sles 11 sp2, kernel 
 3.0.13-0.27-default. My guest is an Ubuntu 12.0.4 LTS 64bit.
 The guest has attached a CDROM, using an iso-file from a CIFS-Share. I 
 detached it with the virtual machine manager (0.9.0).
 I don't see the cd-rom anymore in the virtual machine manager. But when i try 
 to start the vm, it complains about the missing iso-file.
 Why ? I detached it.
 When i like to have a look in the xml-files of the guest, i found three ! One 
 in /var/lib/kvm/images, one in /etc/libvirt/qemu and one in /etc/kvm/vm.
 Which one should i use to configure the vm ? In the one in /etc/libvirt/qemu 
 the cifs-share isn't mentioned any longer, in the other two it is still.
 Is it possible to configure the vm editing one of the XML-files ?
 Or shall i use virsh ? Using virsh, does the vm has to be stopped or can i 
 edit the configuration for a running vm ?
 Why three xml-files ? Why is detaching with the virtual machine manager not 
 working ?

Hi Bernd,
This is a libvirt question, I have CCed the libvirt mailing list.

Do not edit the XML files on disk.  Instead, use virsh edit (to
modify) and virsh dumpxml (to view).
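
For example (the domain name here is just a placeholder):

  $ virsh dumpxml mydomain > mydomain.xml   # view the current configuration
  $ virsh edit mydomain                     # edit it, with validation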

Stefan


Re: qemu-kvm-1.2.0: double free or corruption

2012-11-20 Thread Stefan Hajnoczi
On Mon, Nov 19, 2012 at 8:56 AM, Nikola Ciprich
nikola.cipr...@linuxbox.cz wrote:
 on one of our servers, windows 2008 KVM suddenly crashed. I see following
 in libvirt log:

 *** glibc detected *** /usr/bin/qemu-kvm: double free or corruption (!prev): 
 0x7fc634008cd0 ***
 === Backtrace: =
 /lib64/libc.so.6(+0x75916)[0x7fc9026f4916]
 /lib64/libc.so.6(+0x78443)[0x7fc9026f7443]
 /usr/bin/qemu-kvm(+0x1faeb1)[0x7fc907187eb1]
 /usr/bin/qemu-kvm(+0x1f0e1a)[0x7fc90717de1a]
 /usr/bin/qemu-kvm(+0x1fb681)[0x7fc907188681]
 /usr/bin/qemu-kvm(+0xed6a7)[0x7fc90707a6a7]
 /usr/bin/qemu-kvm(+0x195c31)[0x7fc907122c31]
 /usr/bin/qemu-kvm(main+0x106c)[0x7fc90711e5fc]
 /lib64/libc.so.6(__libc_start_main+0xfd)[0x7fc90269dcdd]
 /usr/bin/qemu-kvm(+0x749f9)[0x7fc9070019f9]
[...]
 I guess this is not of much use, since I didn't have debuginfo package 
 installed
 in time of crash. Is it possible to obtain more debuginfo after I installed 
 it?
 Is there something else I should check to find where the problem could be?

No problem, you can still resolve symbols afterwards.  Download the
debuginfo package and use something along the lines of:
$ addr2line -e /path/to/debug-executable 0x1faeb1 0x1f0e1a 0x1fb681
0xed6a7 0x195c31

It's important to fetch the debuginfo package for the exact same
version of the qemu RPM you were running.

Stefan


Re: qemu-kvm-1.2.0: double free or corruption in VNC code

2012-11-20 Thread Nikola Ciprich
Hello Stefan,

thanks! here it goes..

  *** glibc detected *** /usr/bin/qemu-kvm: double free or corruption 
  (!prev): 0x7fc634008cd0 ***
  === Backtrace: =
  /lib64/libc.so.6(+0x75916)[0x7fc9026f4916]
  /lib64/libc.so.6(+0x78443)[0x7fc9026f7443]
  /usr/bin/qemu-kvm(+0x1faeb1)[0x7fc907187eb1]
  /usr/bin/qemu-kvm(+0x1f0e1a)[0x7fc90717de1a]
  /usr/bin/qemu-kvm(+0x1fb681)[0x7fc907188681]
  /usr/bin/qemu-kvm(+0xed6a7)[0x7fc90707a6a7]
  /usr/bin/qemu-kvm(+0x195c31)[0x7fc907122c31]
  /usr/bin/qemu-kvm(main+0x106c)[0x7fc90711e5fc]
  /lib64/libc.so.6(__libc_start_main+0xfd)[0x7fc90269dcdd]
  /usr/bin/qemu-kvm(+0x749f9)[0x7fc9070019f9]
 [...]

[root@blg qemu-kvm-1.2.0]# addr2line -e /usr/lib/debug/usr/bin/qemu-kvm.debug 
0x1faeb1 0x1f0e1a 0x1fb681 0xed6a7 0x195c31 0x106c
/usr/src/debug/qemu-kvm-1.2.0/ui/vnc.c:499
/usr/src/debug/qemu-kvm-1.2.0/ui/vnc-enc-zrle.c:364
/usr/src/debug/qemu-kvm-1.2.0/ui/vnc.c:1037
/usr/src/debug/qemu-kvm-1.2.0/iohandler.c:159
/usr/src/debug/qemu-kvm-1.2.0/main-loop.c:499
??:0

this makes some sense to me, since it crashed while there was VNC
connection active..

 
 It's important to fetch the debuginfo package for the exact same
 version of the qemu RPM you were running.
sure, it's the same version.

BR

nik

-- 
-
Ing. Nikola CIPRICH
LinuxBox.cz, s.r.o.
28.rijna 168, 709 00 Ostrava

tel.:   +420 591 166 214
fax:+420 596 621 273
mobil:  +420 777 093 799
www.linuxbox.cz

mobil servis: +420 737 238 656
email servis: ser...@linuxbox.cz
-


pgpCg25xs1tyP.pgp
Description: PGP signature


[PATCH v2 2/3] KVM: PPC: Book3S HV: Make a HPTE removal function available

2012-11-20 Thread Paul Mackerras
This makes a HPTE removal function, kvmppc_do_h_remove(), available
outside book3s_hv_rm_mmu.c.  This will be used by the HPT writing
code.

Signed-off-by: Paul Mackerras pau...@samba.org
---
v2: basically unchanged from v1, just rediffed

 arch/powerpc/include/asm/kvm_book3s.h |3 +++
 arch/powerpc/kvm/book3s_hv_rm_mmu.c   |   19 +--
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index fea768f..46763d10 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -160,6 +160,9 @@ extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, 
unsigned long flags,
 extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
long pte_index, unsigned long pteh, unsigned long ptel,
pgd_t *pgdir, bool realmode, unsigned long *idx_ret);
+extern long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
+   unsigned long pte_index, unsigned long avpn,
+   unsigned long *hpret);
 extern long kvmppc_hv_get_dirty_log(struct kvm *kvm,
struct kvm_memory_slot *memslot, unsigned long *map);
 
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index a96f90a..2334000 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -365,11 +365,10 @@ static inline int try_lock_tlbie(unsigned int *lock)
return old == 0;
 }
 
-long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
-unsigned long pte_index, unsigned long avpn,
-unsigned long va)
+long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
+   unsigned long pte_index, unsigned long avpn,
+   unsigned long *hpret)
 {
-   struct kvm *kvm = vcpu->kvm;
unsigned long *hpte;
unsigned long v, r, rb;
struct revmap_entry *rev;
@@ -411,10 +410,18 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long 
flags,
note_hpte_modification(kvm, rev);
unlock_hpte(hpte, 0);
 
-   vcpu->arch.gpr[4] = v;
-   vcpu->arch.gpr[5] = r;
+   hpret[0] = v;
+   hpret[1] = r;
return H_SUCCESS;
 }
+EXPORT_SYMBOL_GPL(kvmppc_do_h_remove);
+
+long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
+unsigned long pte_index, unsigned long avpn)
+{
+   return kvmppc_do_h_remove(vcpu->kvm, flags, pte_index, avpn,
+ &vcpu->arch.gpr[4]);
+}
 
 long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 {
-- 
1.7.10.4



[PATCH v2 1/3] KVM: PPC: Book3S HV: Add a mechanism for recording modified HPTEs

2012-11-20 Thread Paul Mackerras
This uses a bit in our record of the guest view of the HPTE to record
when the HPTE gets modified.  We use a reserved bit for this, and ensure
that this bit is always cleared in HPTE values returned to the guest.

The recording of modified HPTEs is only done if other code indicates
its interest by setting kvm->arch.hpte_mod_interest to a non-zero value.
The reason for this is that when later commits add facilities for
userspace to read the HPT, the first pass of reading the HPT will be
quicker if there are no (or very few) HPTEs marked as modified,
rather than having most HPTEs marked as modified.

Signed-off-by: Paul Mackerras pau...@samba.org
---
v2: added HPTE_GR_RESERVED, clear those bits in H_ENTER

 arch/powerpc/include/asm/kvm_book3s_64.h |9 +
 arch/powerpc/include/asm/kvm_host.h  |1 +
 arch/powerpc/kvm/book3s_hv_rm_mmu.c  |   28 
 3 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
b/arch/powerpc/include/asm/kvm_book3s_64.h
index 1472a5b..b322e5b 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -50,6 +50,15 @@ extern int kvm_hpt_order;/* order of 
preallocated HPTs */
 #define HPTE_V_HVLOCK  0x40UL
 #define HPTE_V_ABSENT  0x20UL
 
+/*
+ * We use this bit in the guest_rpte field of the revmap entry
+ * to indicate a modified HPTE.
+ */
+#define HPTE_GR_MODIFIED   (1ul << 62)
+
+/* These bits are reserved in the guest view of the HPTE */
+#define HPTE_GR_RESERVED   HPTE_GR_MODIFIED
+
 static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
 {
unsigned long tmp, old;
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 3093896..58c7264 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -248,6 +248,7 @@ struct kvm_arch {
atomic_t vcpus_running;
unsigned long hpt_npte;
unsigned long hpt_mask;
+   atomic_t hpte_mod_interest;
spinlock_t slot_phys_lock;
unsigned short last_vcpu[NR_CPUS];
struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 362dffe..a96f90a 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -66,6 +66,17 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct 
revmap_entry *rev,
 }
 EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
 
+/*
+ * Note modification of an HPTE; set the HPTE modified bit
+ * if anyone is interested.
+ */
+static inline void note_hpte_modification(struct kvm *kvm,
+ struct revmap_entry *rev)
+{
+   if (atomic_read(&kvm->arch.hpte_mod_interest))
+   rev->guest_rpte |= HPTE_GR_MODIFIED;
+}
+
 /* Remove this HPTE from the chain for a real page */
 static void remove_revmap_chain(struct kvm *kvm, long pte_index,
struct revmap_entry *rev,
@@ -138,7 +149,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
unsigned long slot_fn, hva;
unsigned long *hpte;
struct revmap_entry *rev;
-   unsigned long g_ptel = ptel;
+   unsigned long g_ptel;
struct kvm_memory_slot *memslot;
unsigned long *physp, pte_size;
unsigned long is_io;
@@ -153,6 +164,8 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
return H_PARAMETER;
writing = hpte_is_writable(ptel);
	pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);
+   ptel &= ~HPTE_GR_RESERVED;
+   g_ptel = ptel;
 
/* used later to detect if we might have been invalidated */
	mmu_seq = kvm->mmu_notifier_seq;
@@ -287,8 +300,10 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long 
flags,
	rev = &kvm->arch.revmap[pte_index];
	if (realmode)
	rev = real_vmalloc_addr(rev);
-   if (rev)
+   if (rev) {
	rev->guest_rpte = g_ptel;
+   note_hpte_modification(kvm, rev);
+   }
 
/* Link HPTE into reverse-map chain */
	if (pteh & HPTE_V_VALID) {
@@ -392,7 +407,8 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long 
flags,
/* Read PTE low word after tlbie to get final R/C values */
remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]);
}
-   r = rev->guest_rpte;
+   r = rev->guest_rpte & ~HPTE_GR_RESERVED;
+   note_hpte_modification(kvm, rev);
unlock_hpte(hpte, 0);
 
	vcpu->arch.gpr[4] = v;
@@ -466,6 +482,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 
	args[j] = ((0x80 | flags) << 56) + pte_index;
	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+   note_hpte_modification(kvm, rev);
 
	if (!(hp[0] & HPTE_V_VALID)) {
 

[PATCH v2 3/3] KVM: PPC: Book3S HV: Provide a method for userspace to read and write the HPT

2012-11-20 Thread Paul Mackerras
A new ioctl, KVM_PPC_GET_HTAB_FD, returns a file descriptor.  Reads on
this fd return the contents of the HPT (hashed page table), writes
create and/or remove entries in the HPT.  There is a new capability,
KVM_CAP_PPC_HTAB_FD, to indicate the presence of the ioctl.  The ioctl
takes an argument structure with the index of the first HPT entry to
read out and a set of flags.  The flags indicate whether the user is
intending to read or write the HPT, and whether to return all entries
or only the bolted entries (those with the bolted bit, 0x10, set in
the first doubleword).

This is intended for use in implementing qemu's savevm/loadvm and for
live migration.  Therefore, on reads, the first pass returns information
about all HPTEs (or all bolted HPTEs).  When the first pass reaches the
end of the HPT, it returns from the read.  Subsequent reads only return
information about HPTEs that have changed since they were last read.
A read that finds no changed HPTEs in the HPT following where the last
read finished will return 0 bytes.

The format of the data provides a simple run-length compression of the
invalid entries.  Each block of data starts with a header that indicates
the index (position in the HPT, which is just an array), the number of
valid entries starting at that index (may be zero), and the number of
invalid entries following those valid entries.  The valid entries, 16
bytes each, follow the header.  The invalid entries are not explicitly
represented.
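
For illustration only, a minimal userspace sketch of walking one read() worth
of this stream is shown below.  The header layout matches struct
kvm_get_htab_header from the patch; obtaining the fd via the
KVM_PPC_GET_HTAB_FD ioctl, buffer sizing and error handling are all assumed
or elided:

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

struct kvm_get_htab_header {
	uint32_t index;
	uint16_t n_valid;
	uint16_t n_invalid;
};

static void dump_htab_chunk(int fd)
{
	char buf[65536];
	ssize_t len = read(fd, buf, sizeof(buf));
	char *p = buf;

	while (len > 0 && p + sizeof(struct kvm_get_htab_header) <= buf + len) {
		struct kvm_get_htab_header *hdr = (void *)p;
		uint64_t *hpte = (uint64_t *)(hdr + 1);	/* n_valid 16-byte HPTEs */
		int i;

		for (i = 0; i < hdr->n_valid; i++)
			printf("HPTE %u: %016llx %016llx\n", hdr->index + i,
			       (unsigned long long)hpte[2 * i],
			       (unsigned long long)hpte[2 * i + 1]);
		/* the n_invalid entries are implicit and carry no data */
		p = (char *)&hpte[2 * hdr->n_valid];
	}
}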

Signed-off-by: Paul Mackerras pau...@samba.org
---
v2: added comments, added reserved field in struct kvm_get_htab_fd

 Documentation/virtual/kvm/api.txt|   53 +
 arch/powerpc/include/asm/kvm_book3s_64.h |   22 ++
 arch/powerpc/include/asm/kvm_ppc.h   |2 +
 arch/powerpc/include/uapi/asm/kvm.h  |   25 +++
 arch/powerpc/kvm/book3s_64_mmu_hv.c  |  344 ++
 arch/powerpc/kvm/book3s_hv.c |   12 --
 arch/powerpc/kvm/powerpc.c   |   17 ++
 include/uapi/linux/kvm.h |3 +
 8 files changed, 466 insertions(+), 12 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 6671fdc..33080ea 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2071,6 +2071,59 @@ KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; 
source cpu in parm
 
 Note that the vcpu ioctl is asynchronous to vcpu execution.
 
+4.78 KVM_PPC_GET_HTAB_FD
+
+Capability: KVM_CAP_PPC_HTAB_FD
+Architectures: powerpc
+Type: vm ioctl
+Parameters: Pointer to struct kvm_get_htab_fd (in)
+Returns: file descriptor number (= 0) on success, -1 on error
+
+This returns a file descriptor that can be used either to read out the
+entries in the guest's hashed page table (HPT), or to write entries to
+initialize the HPT.  The returned fd can only be written to if the
+KVM_GET_HTAB_WRITE bit is set in the flags field of the argument, and
+can only be read if that bit is clear.  The argument struct looks like
+this:
+
+/* For KVM_PPC_GET_HTAB_FD */
+struct kvm_get_htab_fd {
+   __u64   flags;
+   __u64   start_index;
+};
+
+/* Values for kvm_get_htab_fd.flags */
+#define KVM_GET_HTAB_BOLTED_ONLY   ((__u64)0x1)
+#define KVM_GET_HTAB_WRITE ((__u64)0x2)
+
+The `start_index' field gives the index in the HPT of the entry at
+which to start reading.  It is ignored when writing.
+
+Reads on the fd will initially supply information about all
+interesting HPT entries.  Interesting entries are those with the
+bolted bit set, if the KVM_GET_HTAB_BOLTED_ONLY bit is set, otherwise
+all entries.  When the end of the HPT is reached, the read() will
+return.  If read() is called again on the fd, it will start again from
+the beginning of the HPT, but will only return HPT entries that have
+changed since they were last read.
+
+Data read or written is structured as a header (8 bytes) followed by a
+series of valid HPT entries (16 bytes) each.  The header indicates how
+many valid HPT entries there are and how many invalid entries follow
+the valid entries.  The invalid entries are not represented explicitly
+in the stream.  The header format is:
+
+struct kvm_get_htab_header {
+   __u32   index;
+   __u16   n_valid;
+   __u16   n_invalid;
+};
+
+Writes to the fd create HPT entries starting at the index given in the
+header; first `n_valid' valid entries with contents from the data
+written, then `n_invalid' invalid entries, invalidating any previously
+valid entries found.
+
 
 5. The kvm_run structure
 
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
b/arch/powerpc/include/asm/kvm_book3s_64.h
index b322e5b..38bec1d 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -246,4 +246,26 @@ static inline bool slot_is_aligned(struct kvm_memory_slot 
*memslot,
	return !(memslot->base_gfn & mask) && !(memslot->npages & mask);
 }
 
+/*
+ * This 

[PATCH] KVM: PPC: Book3S HV: Fix bug causing loss of page dirty state

2012-11-20 Thread Paul Mackerras
This fixes a bug where adding a new guest HPT entry via the H_ENTER
hcall would lose the changed bit in the reverse map information
for the guest physical page being mapped.  The result was that the
KVM_GET_DIRTY_LOG could return a zero bit for the page even though
the page had been modified by the guest.

This fixes it by only modifying the index and present bits in the
reverse map entry, thus preserving the reference and change bits.
We were also unnecessarily setting the reference bit, and this
fixes that too.

Signed-off-by: Paul Mackerras pau...@samba.org
---
This is against Alex Graf's kvm-ppc-next branch plus the series of three
patches I just sent, but it should be independent of that series.

 arch/powerpc/kvm/book3s_hv_rm_mmu.c |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 2334000..fc3da32 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -59,10 +59,10 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct 
revmap_entry *rev,
	head->back = pte_index;
	} else {
	rev->forw = rev->back = pte_index;
-   i = pte_index;
+   *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) |
+   pte_index | KVMPPC_RMAP_PRESENT;
}
-   smp_wmb();
-   *rmap = i | KVMPPC_RMAP_REFERENCED | KVMPPC_RMAP_PRESENT; /* unlock */
+   unlock_rmap(rmap);
 }
 EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
 
-- 
1.7.10.4



Re: [PATCH] KVM: PPC: Book3S HV: Fix bug causing loss of page dirty state

2012-11-20 Thread Alexander Graf

On 20.11.2012, at 10:01, Paul Mackerras wrote:

 This fixes a bug where adding a new guest HPT entry via the H_ENTER
 hcall would lose the changed bit in the reverse map information
 for the guest physical page being mapped.  The result was that the
 KVM_GET_DIRTY_LOG could return a zero bit for the page even though
 the page had been modified by the guest.
 
 This fixes it by only modifying the index and present bits in the
 reverse map entry, thus preserving the reference and change bits.
 We were also unnecessarily setting the reference bit, and this
 fixes that too.
 
 Signed-off-by: Paul Mackerras pau...@samba.org

Thanks, applied to kvm-ppc-next.

Alex




Re: [PATCH v2 3/3] KVM: PPC: Book3S HV: Provide a method for userspace to read and write the HPT

2012-11-20 Thread Alexander Graf

On 20.11.2012, at 09:57, Paul Mackerras wrote:

 A new ioctl, KVM_PPC_GET_HTAB_FD, returns a file descriptor.  Reads on
 this fd return the contents of the HPT (hashed page table), writes
 create and/or remove entries in the HPT.  There is a new capability,
 KVM_CAP_PPC_HTAB_FD, to indicate the presence of the ioctl.  The ioctl
 takes an argument structure with the index of the first HPT entry to
 read out and a set of flags.  The flags indicate whether the user is
 intending to read or write the HPT, and whether to return all entries
 or only the bolted entries (those with the bolted bit, 0x10, set in
 the first doubleword).
 
 This is intended for use in implementing qemu's savevm/loadvm and for
 live migration.  Therefore, on reads, the first pass returns information
 about all HPTEs (or all bolted HPTEs).  When the first pass reaches the
 end of the HPT, it returns from the read.  Subsequent reads only return
 information about HPTEs that have changed since they were last read.
 A read that finds no changed HPTEs in the HPT following where the last
 read finished will return 0 bytes.
 
 The format of the data provides a simple run-length compression of the
 invalid entries.  Each block of data starts with a header that indicates
 the index (position in the HPT, which is just an array), the number of
 valid entries starting at that index (may be zero), and the number of
 invalid entries following those valid entries.  The valid entries, 16
 bytes each, follow the header.  The invalid entries are not explicitly
 represented.
 
 Signed-off-by: Paul Mackerras pau...@samba.org
 ---
 v2: added comments, added reserved field in struct kvm_get_htab_fd
 
 Documentation/virtual/kvm/api.txt|   53 +
 arch/powerpc/include/asm/kvm_book3s_64.h |   22 ++
 arch/powerpc/include/asm/kvm_ppc.h   |2 +
 arch/powerpc/include/uapi/asm/kvm.h  |   25 +++
 arch/powerpc/kvm/book3s_64_mmu_hv.c  |  344 ++
 arch/powerpc/kvm/book3s_hv.c |   12 --
 arch/powerpc/kvm/powerpc.c   |   17 ++
 include/uapi/linux/kvm.h |3 +
 8 files changed, 466 insertions(+), 12 deletions(-)
 
 diff --git a/Documentation/virtual/kvm/api.txt 
 b/Documentation/virtual/kvm/api.txt
 index 6671fdc..33080ea 100644
 --- a/Documentation/virtual/kvm/api.txt
 +++ b/Documentation/virtual/kvm/api.txt
 @@ -2071,6 +2071,59 @@ KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external 
 call; source cpu in parm
 
 Note that the vcpu ioctl is asynchronous to vcpu execution.
 
 +4.78 KVM_PPC_GET_HTAB_FD
 +
 +Capability: KVM_CAP_PPC_HTAB_FD
 +Architectures: powerpc
 +Type: vm ioctl
 +Parameters: Pointer to struct kvm_get_htab_fd (in)
 +Returns: file descriptor number (>= 0) on success, -1 on error
 +
 +This returns a file descriptor that can be used either to read out the
 +entries in the guest's hashed page table (HPT), or to write entries to
 +initialize the HPT.  The returned fd can only be written to if the
 +KVM_GET_HTAB_WRITE bit is set in the flags field of the argument, and
 +can only be read if that bit is clear.  The argument struct looks like
 +this:
 +
 +/* For KVM_PPC_GET_HTAB_FD */
 +struct kvm_get_htab_fd {
 + __u64   flags;
 + __u64   start_index;

Documentation is out of sync :).

Applied all 3 with fixed documentation.


Alex
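
For reference, a minimal sketch of the first read pass described above.  Not
from the patch: it assumes the uapi definitions added by this series
(KVM_PPC_GET_HTAB_FD, struct kvm_get_htab_fd, struct kvm_get_htab_header) are
available via <linux/kvm.h>, `vmfd' is an already-open VM file descriptor,
and dump_hpt_once() is a made-up helper name; error handling is trimmed.

#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int dump_hpt_once(int vmfd)
{
	struct kvm_get_htab_fd ghf = {
		.flags = 0,		/* read; all entries, not just bolted */
		.start_index = 0,
	};
	uint64_t buf[8192];		/* 64kB, kept 8-byte aligned */
	ssize_t len;
	int fd;

	fd = ioctl(vmfd, KVM_PPC_GET_HTAB_FD, &ghf);
	if (fd < 0)
		return -1;

	/* The first pass returns from read() once the end of the HPT is reached. */
	while ((len = read(fd, buf, sizeof(buf))) > 0) {
		char *p = (char *)buf;

		while (p < (char *)buf + len) {
			struct kvm_get_htab_header *hdr =
				(struct kvm_get_htab_header *)p;

			printf("index %u: %u valid, %u invalid\n",
			       hdr->index, hdr->n_valid, hdr->n_invalid);
			/* each valid entry is a 16-byte (v, r) pair after the header */
			p += sizeof(*hdr) + hdr->n_valid * 16;
		}
	}
	close(fd);
	return len < 0 ? -1 : 0;
}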
