[PATCH 11/11] nEPT: Provide the correct exit qualification upon EPT violation

2013-04-25 Thread Jun Nakajima
Save bits [2:0] of the exit qualification at EPT violation, and use that information
when injecting the EPT violation into L1.
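
For reference (background from the Intel SDM, not something this patch adds), bits
[2:0] of the EPT violation exit qualification encode the access type that caused the
violation, which is exactly what gets reused as the error code below. The macro names
here are illustrative only:

    /* Illustrative definitions; the patch itself just masks with 0x7 */
    #define EPT_VIOLATION_READ   (1UL << 0)  /* violating access was a data read */
    #define EPT_VIOLATION_WRITE  (1UL << 1)  /* violating access was a data write */
    #define EPT_VIOLATION_FETCH  (1UL << 2)  /* violating access was an instruction fetch */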

Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/include/asm/kvm_host.h | 2 ++
 arch/x86/kvm/paging_tmpl.h  | 5 +
 arch/x86/kvm/vmx.c  | 3 +++
 3 files changed, 10 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4979778..e029bba 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -504,6 +504,8 @@ struct kvm_vcpu_arch {
 * instruction.
 */
bool write_fault_to_shadow_pgtable;
+
+   unsigned long exit_qualification; /* set at EPT violation at this point */
 };
 
 struct kvm_lpage_info {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index e13b6c5..bd370e7 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -349,7 +349,12 @@ error:
 
walker->fault.vector = PF_VECTOR;
walker->fault.error_code_valid = true;
+#if PTTYPE != PTTYPE_EPT
walker->fault.error_code = errcode;
+#else
+   /* Reuse bits [2:0] of EPT violation */
+   walker->fault.error_code = vcpu->arch.exit_qualification & 0x7;
+#endif
walker->fault.address = addr;
walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 95304cc..61e2853 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -425,6 +425,7 @@ struct vcpu_vmx {
ktime_t entry_time;
s64 vnmi_blocked_time;
u32 exit_reason;
+   unsigned long exit_qualification;
 
bool rdtscp_enabled;
 
@@ -5074,6 +5075,8 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
/* ept page table is present? */
error_code |= (exit_qualification >> 3) & 0x1;
 
+vcpu->arch.exit_qualification = exit_qualification;
+
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
 }
 
-- 
1.8.2.1.610.g562af5b



[PATCH 10/11] nEPT: Miscellaneous cleanups

2013-04-25 Thread Jun Nakajima
Some trivial code cleanups not really related to nested EPT.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/vmx.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c67eb06..95304cc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -616,7 +616,6 @@ static void nested_release_page_clean(struct page *page)
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
-static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -912,8 +911,7 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
(vmcs12->secondary_vm_exec_control & bit);
 }
 
-static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
-   struct kvm_vcpu *vcpu)
+static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
 {
return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }
@@ -6321,7 +6319,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 
if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
!(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
-   get_vmcs12(vcpu), vcpu)))) {
+   get_vmcs12(vcpu))))) {
if (vmx_interrupt_allowed(vcpu)) {
vmx->soft_vnmi_blocked = 0;
} else if (vmx->vnmi_blocked_time > 10LL &&
-- 
1.8.2.1.610.g562af5b



[PATCH 09/11] nEPT: Documentation

2013-04-25 Thread Jun Nakajima
Update the documentation to no longer say that nested EPT is not supported.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 Documentation/virtual/kvm/nested-vmx.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/virtual/kvm/nested-vmx.txt 
b/Documentation/virtual/kvm/nested-vmx.txt
index 8ed937d..cdf7839 100644
--- a/Documentation/virtual/kvm/nested-vmx.txt
+++ b/Documentation/virtual/kvm/nested-vmx.txt
@@ -38,8 +38,8 @@ The current code supports running Linux guests under KVM 
guests.
 Only 64-bit guest hypervisors are supported.
 
 Additional patches for running Windows under guest KVM, and Linux under
-guest VMware server, and support for nested EPT, are currently running in
-the lab, and will be sent as follow-on patchsets.
+guest VMware server, are currently running in the lab, and will be sent as
+follow-on patchsets.
 
 
 Running nested VMX
-- 
1.8.2.1.610.g562af5b



[PATCH 08/11] nEPT: Nested INVEPT

2013-04-25 Thread Jun Nakajima
If we let L1 use EPT, we should probably also support the INVEPT instruction.

In our current nested EPT implementation, when L1 changes its EPT table for
L2 (i.e., EPT12), L0 modifies the shadow EPT table (EPT02), and in the course
of this modification already calls INVEPT. Therefore, when L1 calls INVEPT,
we don't really need to do anything. In particular we *don't* need to call
the real INVEPT again. All we do in our INVEPT is verify the validity of the
call, and its parameters, and then do nothing.
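
For context, here is a minimal sketch of what L1 actually executes when it "calls
INVEPT", modeled on KVM's own (non-nested) invalidation helper; the wrapper name and
the raw .byte encoding are illustrative, not code from this series:

    struct invept_desc {
        u64 eptp, reserved;     /* 128-bit INVEPT descriptor; high half must be 0 */
    };

    static inline void l1_invept(unsigned long type, u64 eptp)
    {
        struct invept_desc operand = { eptp, 0 };

        /* invept %rcx (type), 128-bit descriptor at (%rax) */
        asm volatile (".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
                      : : "a" (&operand), "c" (type) : "cc", "memory");
    }

It is this instruction, executed by L1, that exits to L0 and lands in the
handle_invept() emulation added below.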

In KVM Forum 2010, Dong et al. presented "Nested Virtualization Friendly KVM"
and classified our current nested EPT implementation as "shadow-like virtual
EPT". He recommended instead a different approach, which he called "VTLB-like
virtual EPT". If we had taken that alternative approach, INVEPT would have had
a bigger role: L0 would only rebuild the shadow EPT table when L1 calls INVEPT.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/include/asm/vmx.h  |  4 +-
 arch/x86/include/uapi/asm/vmx.h |  1 +
 arch/x86/kvm/vmx.c  | 83 +
 3 files changed, 87 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index b6fbf86..0ce54f3 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -376,7 +376,9 @@ enum vmcs_field {
 #define VMX_EPTP_WB_BIT   (1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT   (1ull << 16)
 #define VMX_EPT_1GB_PAGE_BIT   (1ull << 17)
-#define VMX_EPT_AD_BIT (1ull << 21)
+#define VMX_EPT_INVEPT_BIT (1ull << 20)
+#define VMX_EPT_AD_BIT (1ull << 21)
+#define VMX_EPT_EXTENT_INDIVIDUAL_BIT  (1ull << 24)
 #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT  (1ull << 26)
 
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 2871fcc..5662cef 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -65,6 +65,7 @@
 #define EXIT_REASON_EOI_INDUCED 45
 #define EXIT_REASON_EPT_VIOLATION   48
 #define EXIT_REASON_EPT_MISCONFIG   49
+#define EXIT_REASON_INVEPT 50
 #define EXIT_REASON_WBINVD  54
 #define EXIT_REASON_XSETBV  55
 #define EXIT_REASON_APIC_WRITE  56
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 76df3a8..c67eb06 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5879,6 +5879,87 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
return 1;
 }
 
+/* Emulate the INVEPT instruction */
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
+   u32 vmx_instruction_info;
+   unsigned long type;
+   gva_t gva;
+   struct x86_exception e;
+   struct {
+   u64 eptp, gpa;
+   } operand;
+
+   if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
+   !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
+   kvm_queue_exception(vcpu, UD_VECTOR);
+   return 1;
+   }
+
+   if (!nested_vmx_check_permission(vcpu))
+   return 1;
+
+   if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
+   kvm_queue_exception(vcpu, UD_VECTOR);
+   return 1;
+   }
+
+   /* According to the Intel VMX instruction reference, the memory
+* operand is read even if it isn't needed (e.g., for type==global)
+*/
+   vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+   if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+   vmx_instruction_info, &gva))
+   return 1;
+   if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
+   sizeof(operand), &e)) {
+   kvm_inject_page_fault(vcpu, &e);
+   return 1;
+   }
+
+   type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+   switch (type) {
+   case VMX_EPT_EXTENT_GLOBAL:
+   if (!(nested_vmx_ept_caps & VMX_EPT_EXTENT_GLOBAL_BIT))
+   nested_vmx_failValid(vcpu,
+   VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+   else {
+   /*
+* Do nothing: when L1 changes EPT12, we already
+* update EPT02 (the shadow EPT table) and call INVEPT.
+* So when L1 calls INVEPT, there's nothing left to do.
+*/
+   nested_vmx_succeed(vcpu);
+   }
+   break;
+   case VMX_EPT_EXTENT_CONTEXT:
+   if (!(nested_vmx_ept_caps & VMX_EPT_EXTENT_CONTEXT_BIT))
+   nested_vmx_failValid(vcpu,
+   VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);

[PATCH 07/11] nEPT: Advertise EPT to L1

2013-04-25 Thread Jun Nakajima
Advertise the support of EPT to the L1 guest, through the appropriate MSR.

This is the last patch of the basic Nested EPT feature, so as to allow
bisection through this patch series: The guest will not see EPT support until
this last patch, and will not attempt to use the half-applied feature.
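
As a rough illustration (an assumption about the L1 side, not code from this series),
this is approximately what an L1 hypervisor such as KVM does with the capability MSR
that this patch starts populating; the helper name is made up:

    static bool l1_sees_nested_ept(void)
    {
        u64 ept_vpid_cap;

        /* This read is satisfied by L0's vmx_get_vmx_msr() below */
        rdmsrl(MSR_IA32_VMX_EPT_VPID_CAP, ept_vpid_cap);
        return ept_vpid_cap & VMX_EPT_PAGE_WALK_4_BIT;
    }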

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/vmx.c | 17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 66ead51..76df3a8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2027,6 +2027,7 @@ static u32 nested_vmx_secondary_ctls_low, 
nested_vmx_secondary_ctls_high;
 static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
 static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
 static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
+static u32 nested_vmx_ept_caps;
 static __init void nested_vmx_setup_ctls_msrs(void)
 {
/*
@@ -2102,6 +2103,18 @@ static __init void nested_vmx_setup_ctls_msrs(void)
nested_vmx_secondary_ctls_low = 0;
nested_vmx_secondary_ctls_high &=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+   if (enable_ept) {
+   /* nested EPT: emulate EPT also to L1 */
+   nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
+   nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT;
+   nested_vmx_ept_caps |=
+   VMX_EPT_INVEPT_BIT | VMX_EPT_EXTENT_GLOBAL_BIT |
+   VMX_EPT_EXTENT_CONTEXT_BIT |
+   VMX_EPT_EXTENT_INDIVIDUAL_BIT;
+   nested_vmx_ept_caps &= vmx_capability.ept;
+   } else
+   nested_vmx_ept_caps = 0;
+
 }
 
 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
@@ -2201,8 +2214,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 
msr_index, u64 *pdata)
nested_vmx_secondary_ctls_high);
break;
case MSR_IA32_VMX_EPT_VPID_CAP:
-   /* Currently, no nested ept or nested vpid */
-   *pdata = 0;
+   /* Currently, no nested vpid support */
+   *pdata = nested_vmx_ept_caps;
break;
default:
return 0;
-- 
1.8.2.1.610.g562af5b



[PATCH 06/11] nEPT: Some additional comments

2013-04-25 Thread Jun Nakajima
Some additional comments to preexisting code:
Explain who (L0 or L1) handles EPT violation and misconfiguration exits.
Don't mention "shadow on either EPT or shadow" as the only two options.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/vmx.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 26a1b6f..66ead51 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6127,7 +6127,20 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu 
*vcpu)
return nested_cpu_has2(vmcs12,
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
case EXIT_REASON_EPT_VIOLATION:
+   /*
+* L0 always deals with the EPT violation. If nested EPT is
+* used, and the nested mmu code discovers that the address is
+* missing in the guest EPT table (EPT12), the EPT violation
+* will be injected with nested_ept_inject_page_fault()
+*/
+   return 0;
case EXIT_REASON_EPT_MISCONFIG:
+   /*
+* L2 never directly uses L1's EPT, but rather L0's own EPT
+* table (shadow on EPT) or a merged EPT table that L0 built
+* (EPT on EPT). So any problem with the structure of the
+* table is L0's fault.
+*/
return 0;
case EXIT_REASON_WBINVD:
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
-- 
1.8.2.1.610.g562af5b



[PATCH 05/11] nEPT: Fix wrong test in kvm_set_cr3

2013-04-25 Thread Jun Nakajima
kvm_set_cr3() attempts to check if the new cr3 is a valid guest physical
address. The problem is that with nested EPT, cr3 is an *L2* physical
address, not an L1 physical address as this test expects.

As the comment above this test explains, it isn't necessary, and doesn't
correspond to anything a real processor would do. So this patch removes it.

Note that this wrong test could have also theoretically caused problems
in nested NPT, not just in nested EPT. However, in practice, the problem
was avoided: nested_svm_vmexit()/vmrun() do not call kvm_set_cr3 in the
nested NPT case, and instead set the vmcb (and arch.cr3) directly, thus
circumventing the problem. Additional potential calls to the buggy function
are avoided in that we don't trap cr3 modifications when nested NPT is
enabled. However, because in nested VMX we did want to use kvm_set_cr3()
(as requested in Avi Kivity's review of the original nested VMX patches),
we can't avoid this problem and need to fix it.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/x86.c | 11 ---
 1 file changed, 11 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e172132..c34590d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -659,17 +659,6 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 */
}
 
-   /*
-* Does the new cr3 value map to physical memory? (Note, we
-* catch an invalid cr3 even in real-mode, because it would
-* cause trouble later on when we turn on paging anyway.)
-*
-* A real CPU would silently accept an invalid cr3 and would
-* attempt to use it - with largely undefined (and often hard
-* to debug) behavior on the guest side.
-*/
-   if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
-   return 1;
vcpu->arch.cr3 = cr3;
__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
vcpu->arch.mmu.new_cr3(vcpu);
-- 
1.8.2.1.610.g562af5b



[PATCH 04/11] nEPT: Fix cr3 handling in nested exit and entry

2013-04-25 Thread Jun Nakajima
The existing code for handling cr3 and related VMCS fields during nested
exit and entry wasn't correct in all cases:

If L2 is allowed to control cr3 (and this is indeed the case in nested EPT),
during nested exit we must copy the modified cr3 from vmcs02 to vmcs12, and
we forgot to do so. This patch adds this copy.

If L0 isn't controlling cr3 when running L2 (i.e., L0 is using EPT), and
whoever does control cr3 (L1 or L2) is using PAE, the processor might have
saved PDPTEs and we should also save them in vmcs12 (and restore later).

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/vmx.c | 37 -
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6ab53ca..26a1b6f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7163,10 +7163,26 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, 
struct vmcs12 *vmcs12)
vmx_set_cr4(vcpu, vmcs12->guest_cr4);
vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
 
-   /* shadow page tables on either EPT or shadow page tables */
+   /*
+* Note that kvm_set_cr3() and kvm_mmu_reset_context() will do the
+* right thing, and set GUEST_CR3 and/or EPT_POINTER in all supported
+* settings: 1. shadow page tables on shadow page tables, 2. shadow
+* page tables on EPT, 3. EPT on EPT.
+*/
kvm_set_cr3(vcpu, vmcs12->guest_cr3);
kvm_mmu_reset_context(vcpu);
 
+   /*
+* Additionally, except when L0 is using shadow page tables, L1 or
+* L2 control guest_cr3 for L2, so they may also have saved PDPTEs
+*/
+   if (enable_ept) {
+   vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+   vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+   vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+   vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+   }
+
kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
 }
@@ -7398,6 +7414,25 @@ void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 
*vmcs12)
vmcs12->guest_pending_dbg_exceptions =
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
 
+   /*
+* In some cases (usually, nested EPT), L2 is allowed to change its
+* own CR3 without exiting. If it has changed it, we must keep it.
+* Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
+* by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
+*/
+   if (enable_ept)
+   vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3);
+   /*
+* Additionally, except when L0 is using shadow page tables, L1 or
+* L2 control guest_cr3 for L2, so save their PDPTEs
+*/
+   if (enable_ept) {
+   vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+   vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+   vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+   vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+   }
+
/* TODO: These cannot have changed unless we have MSR bitmaps and
 * the relevant bit asks not to trap the change */
vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
-- 
1.8.2.1.610.g562af5b



[PATCH 03/11] nEPT: MMU context for nested EPT

2013-04-25 Thread Jun Nakajima
KVM's existing shadow MMU code already supports nested TDP. To use it, we
need to set up a new "MMU context" for nested EPT, and create a few callbacks
for it (nested_ept_*()). This context should also use the EPT versions of
the page table access functions (defined in the previous patch).
Then, we need to switch back and forth between this nested context and the
regular MMU context when switching between L1 and L2 (when L1 runs this L2
with EPT).

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/mmu.c | 38 ++
 arch/x86/kvm/mmu.h |  1 +
 arch/x86/kvm/vmx.c | 53 -
 3 files changed, 91 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cb9c6fd..99bfc5e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3644,6 +3644,44 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct 
kvm_mmu *context)
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
 
+int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
+{
+   ASSERT(vcpu);
+   ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+   context->shadow_root_level = kvm_x86_ops->get_tdp_level();
+
+   context->nx = is_nx(vcpu); /* TODO: ? */
+   context->new_cr3 = paging_new_cr3;
+   context->page_fault = EPT_page_fault;
+   context->gva_to_gpa = EPT_gva_to_gpa;
+   context->sync_page = EPT_sync_page;
+   context->invlpg = EPT_invlpg;
+   context->update_pte = EPT_update_pte;
+   context->free = paging_free;
+   context->root_level = context->shadow_root_level;
+   context->root_hpa = INVALID_PAGE;
+   context->direct_map = false;
+
+   /* TODO: reset_rsvds_bits_mask() is not built for EPT, we need
+  something different.
+*/
+   reset_rsvds_bits_mask(vcpu, context);
+
+
+   /* TODO: I copied these from kvm_init_shadow_mmu, I don't know why
+  they are done, or why they write to vcpu->arch.mmu and not context
+*/
+   vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
+   vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
+   vcpu->arch.mmu.base_role.smep_andnot_wp =
+   kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) &&
+   !is_write_protection(vcpu);
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_EPT_mmu);
+
 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 6987108..19dd5ab 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -54,6 +54,7 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 
addr, u64 sptes[4]);
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool 
direct);
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
+int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9e0ec9d..6ab53ca 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -918,6 +918,11 @@ static inline bool nested_cpu_has_virtual_nmis(struct 
vmcs12 *vmcs12,
return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }
 
+static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
+{
+   return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -6873,6 +6878,46 @@ static void vmx_set_supported_cpuid(u32 func, struct 
kvm_cpuid_entry2 *entry)
entry->ecx |= bit(X86_FEATURE_VMX);
 }
 
+/* Callbacks for nested_ept_init_mmu_context: */
+
+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
+{
+   /* return the page table to be shadowed - in our case, EPT12 */
+   return get_vmcs12(vcpu)->ept_pointer;
+}
+
+static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
+   struct x86_exception *fault)
+{
+   struct vmcs12 *vmcs12;
+   nested_vmx_vmexit(vcpu);
+   vmcs12 = get_vmcs12(vcpu);
+   /*
+* Note no need to set vmcs12->vm_exit_reason as it is already copied
+* from vmcs02 in nested_vmx_vmexit() above, i.e., EPT_VIOLATION.
+*/
+   vmcs12->exit_qualification = fault->error_code;
+   vmcs12->guest_physical_address = fault->address;
+}
+
+static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+   int r = kvm_init_shadow_EPT_mmu(vcpu, &vcpu->arch.mmu);
+
+   vcpu->arch.mmu.set_cr3   = vmx_set_cr3;
+   vcpu->arch.mmu.get_cr3   = nested_ept_get_cr3;
+   vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
+
+   vcpu->arch.walk_mmu  = &vcpu->arch.nested_mm

[PATCH 02/11] nEPT: Add EPT tables support to paging_tmpl.h

2013-04-25 Thread Jun Nakajima
This is the first patch in a series which adds nested EPT support to KVM's
nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use
EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
to set its own cr3 and take its own page faults without either of L0 or L1
getting involved. This often significantly improves L2's performance over the
previous two alternatives (shadow page tables over EPT, and shadow page
tables over shadow page tables).

This patch adds EPT support to paging_tmpl.h.

paging_tmpl.h contains the code for reading and writing page tables. The code
for 32-bit and 64-bit tables is very similar, but not identical, so
paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
with PTTYPE=64, and this generates the two sets of similar functions.

There are subtle but important differences between the format of EPT tables
and that of ordinary x86 64-bit page tables, so for nested EPT we need a
third set of functions to read the guest EPT table and to write the shadow
EPT table.

So this patch adds a third PTTYPE, PTTYPE_EPT, which creates functions (prefixed
with "EPT") which correctly read and write EPT tables.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/mmu.c |  35 ++--
 arch/x86/kvm/paging_tmpl.h | 133 ++---
 2 files changed, 130 insertions(+), 38 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 956ca35..cb9c6fd 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2480,26 +2480,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu 
*vcpu, gfn_t gfn,
return gfn_to_pfn_memslot_atomic(slot, gfn);
 }
 
-static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *spte,
- u64 gpte)
-{
-   if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
-   goto no_present;
-
-   if (!is_present_gpte(gpte))
-   goto no_present;
-
-   if (!(gpte & PT_ACCESSED_MASK))
-   goto no_present;
-
-   return false;
-
-no_present:
-   drop_spte(vcpu->kvm, spte);
-   return true;
-}
-
 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp,
u64 *start, u64 *end)
@@ -3399,16 +3379,6 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, 
unsigned access,
return false;
 }
 
-static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
-{
-   unsigned access;
-
-   access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
-   access &= ~(gpte >> PT64_NX_SHIFT);
-
-   return access;
-}
-
 static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned 
gpte)
 {
unsigned index;
@@ -3418,6 +3388,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, 
unsigned level, unsigned gp
return mmu->last_pte_bitmap & (1 << index);
 }
 
+#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE PTTYPE_EPT
+#include "paging_tmpl.h"
+#undef PTTYPE
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 105dd5b..e13b6c5 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -50,6 +50,22 @@
#define PT_LEVEL_BITS PT32_LEVEL_BITS
#define PT_MAX_FULL_LEVELS 2
#define CMPXCHG cmpxchg
+#elif PTTYPE == PTTYPE_EPT
+   #define pt_element_t u64
+   #define guest_walker guest_walkerEPT
+   #define FNAME(name) EPT_##name
+   #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+   #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+   #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+   #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+   #define PT_LEVEL_BITS PT64_LEVEL_BITS
+   #ifdef CONFIG_X86_64
+   #define PT_MAX_FULL_LEVELS 4
+   #define CMPXCHG cmpxchg
+   #else
+   #define CMPXCHG cmpxchg64
+   #define PT_MAX_FULL_LEVELS 2
+   #endif
 #else
#error Invalid PTTYPE value
 #endif
@@ -80,6 +96,7 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
 }
 
+#if PTTYPE != PTTYPE_EPT
 static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
   pt_element_t __user *ptep_user, unsigned index,
   pt_element_t orig_pte, pt_element_t new_pte)
@@ -102,7 +119,52 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, 
struct kvm_mmu *mmu,
 
return (ret != orig_pte);
 }
+#endif
+
+static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
+{
+   unsigned access;
+
+#if PTTYPE == PTTYPE_EPT
+   /* We rely here that ACC_WRITE_MASK==VMX_EPT_WRITABLE_MASK */
+   

[PATCH 01/11] nEPT: Support LOAD_IA32_EFER entry/exit controls for L1

2013-04-25 Thread Jun Nakajima
Recent KVM, since http://kerneltrap.org/mailarchive/linux-kvm/2010/5/2/6261577
switches the EFER MSR when EPT is used and the host and guest have different
NX bits. So if we add support for nested EPT (L1 guest using EPT to run L2)
and want to be able to run recent KVM as L1, we need to allow L1 to use this
EFER switching feature.

To do this EFER switching, KVM uses VM_ENTRY/EXIT_LOAD_IA32_EFER if available,
and if it isn't, it uses the generic VM_ENTRY/EXIT_MSR_LOAD. This patch adds
support for the former (the latter is still unsupported).

Nested entry and exit emulation (prepare_vmcs02 and load_vmcs12_host_state,
respectively) already handled VM_ENTRY/EXIT_LOAD_IA32_EFER correctly. So all
that's left to do in this patch is to properly advertise this feature to L1.

Note that vmcs12's VM_ENTRY/EXIT_LOAD_IA32_EFER are emulated by L0, by using
vmx_set_efer (which itself sets one of several vmcs02 fields), so we always
support this feature, regardless of whether the host supports it.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/vmx.c | 18 ++
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6667042..9e0ec9d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2057,6 +2057,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #else
nested_vmx_exit_ctls_high = 0;
 #endif
+   nested_vmx_exit_ctls_high |= VM_EXIT_LOAD_IA32_EFER;
 
/* entry controls */
rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2064,6 +2065,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
nested_vmx_entry_ctls_low = 0;
nested_vmx_entry_ctls_high &=
VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
+   nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_IA32_EFER;
 
/* cpu-based controls */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -7050,10 +7052,18 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, 
struct vmcs12 *vmcs12)
vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
 
-   /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */
-   vmcs_write32(VM_EXIT_CONTROLS,
-   vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
-   vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
+   /* L2->L1 exit controls are emulated - the hardware exit is to L0 so
+* we should use its exit controls. Note that IA32_MODE, LOAD_IA32_EFER
+* bits are further modified by vmx_set_efer() below.
+*/
+   vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+
+   /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
+* emulated by vmx_set_efer(), below.
+*/
+   vmcs_write32(VM_ENTRY_CONTROLS,
+   (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
+   ~VM_ENTRY_IA32E_MODE) |
(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
 
if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)
-- 
1.8.2.1.610.g562af5b



Re: [PATCH -v2] x86: Add a Kconfig shortcut for kvm guest kernel

2013-04-25 Thread Ingo Molnar

* Borislav Petkov  wrote:

> From: Borislav Petkov 
> Date: Tue, 16 Apr 2013 18:24:34 +0200
> Subject: [PATCH -v2] x86: Add a Kconfig shortcut for kvm guest kernel
> 
> This is pretty useful for the case where people want to boot the
> resulting kernel in qemu/kvm. Instead of going and searching for each
> required option through the Kconfig maze, this single option should
> simply enable everything required/good to have to boot the resulting
> kernel in the guest.

Please mention:

 ' This patch is based on a similar utility patch of the external
   lkvm tree. '

> 
> Cc: Fengguang Wu 
> Originally-by: Pekka Enberg 
> Originally-by: Sasha Levin 
> Signed-off-by: Borislav Petkov 
> ---
> 
> 
> Here's v2 which should be addressing all review comments so far.
> 
> 
>  arch/x86/Kconfig | 38 ++
>  1 file changed, 38 insertions(+)
> 
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 5651374d179f..76a95ffa959a 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -680,6 +680,44 @@ config KVM_GUEST
> underlying device model, the host provides the guest with
> timing infrastructure such as time of day, and system time
>  
> +config KVM_GUEST_COMMODITY_OPTIONS

Call this KVM_GUEST_COMMON_OPTIONS?

> + bool "Enable commodity options for a standalone KVM guest"
> + depends on KVM_GUEST
> + select NET
> + select NETDEVICES
> + select BLOCK
> + select BLK_DEV
> + select NETWORK_FILESYSTEMS
> + select INET
> + select EXPERIMENTAL
> + select TTY
> + select SERIAL_8250
> + select SERIAL_8250_CONSOLE
> + select IP_PNP
> + select IP_PNP_DHCP
> + select BINFMT_ELF
> + select PCI_MSI
> + select HAVE_ARCH_KGDB
> + select DEBUG_KERNEL
> + select KGDB
> + select KGDB_SERIAL_CONSOLE
> + select VIRTUALIZATION
> + select VIRTIO
> + select VIRTIO_RING
> + select VIRTIO_PCI
> + select VIRTIO_BLK
> + select VIRTIO_CONSOLE
> + select VIRTIO_NET
> + select 9P_FS
> + select NET_9P
> + select NET_9P_VIRTIO
> + ---help---
> +   Select guest kernel functionality which facilitates booting the
> +   kernel as a guest in qemu/kvm. This entails basic stuff like

s/qemu/qemu or lkvm

> +   serial support, kgdb, virtio and other so that you can be able to
> +   have commodity functionality like serial output from the guest,
> +   networking, etc.

And seamless host file system integration into guest context (that is
what the 9P options are about).

Thanks,

Ingo


Re: [Bug 53611] New: nVMX: Add nested EPT

2013-04-25 Thread Jan Kiszka
On 2013-04-25 10:00, Nakajima, Jun wrote:
> On Wed, Apr 24, 2013 at 8:55 AM, Nakajima, Jun  wrote:
>> Sorry about the slow progress. We've been distracted by some priority
>> things. The patches are ready (i.e. working), but we are cleaning them
>> up. I'll send what we have today.
> 
> So, I have sent them, and frankly we are still cleaning up.  Please
> bear with us.
> We are also sending one more patchset to deal with EPT
> misconfiguration, but Linux should run in L2 on top of L1 KVM.

That's great but - as Gleb already said - unfortunately not yet usable.
I'd like to rebase my fixes and enhancements (unrestricted guest mode
specifically) on top of this series these days, and also run some tests with a non-KVM
guest. So, if git send-email is not yet working there, I would also be
happy about a public git repository.

Thanks,
Jan






Re: [PATCH 1/2] kvm: destroy emulated devices on VM exit

2013-04-25 Thread Alexander Graf

On 26.04.2013, at 02:11, Scott Wood wrote:

> The hassle of getting refcounting right was greater than the hassle
> of keeping a list of devices to destroy on VM exit.
> 
> Signed-off-by: Scott Wood 

Thanks, applied both to my irqfd mpic queue.


Alex

> ---
> arch/powerpc/kvm/mpic.c  |2 --
> include/linux/kvm_host.h |3 ++-
> virt/kvm/kvm_main.c  |   29 -
> 3 files changed, 18 insertions(+), 16 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
> index d137df8..4ac98d1 100644
> --- a/arch/powerpc/kvm/mpic.c
> +++ b/arch/powerpc/kvm/mpic.c
> @@ -1788,7 +1788,6 @@ int kvmppc_mpic_connect_vcpu(struct kvm_device *dev, 
> struct kvm_vcpu *vcpu,
>   if (opp->mpic_mode_mask == GCR_MODE_PROXY)
>   vcpu->arch.epr_flags |= KVMPPC_EPR_KERNEL;
> 
> - kvm_device_get(dev);
> out:
>   spin_unlock_irq(&opp->lock);
>   return ret;
> @@ -1804,7 +1803,6 @@ void kvmppc_mpic_disconnect_vcpu(struct openpic *opp, 
> struct kvm_vcpu *vcpu)
>   BUG_ON(!opp->dst[vcpu->arch.irq_cpu_id].vcpu);
> 
>   opp->dst[vcpu->arch.irq_cpu_id].vcpu = NULL;
> - kvm_device_put(opp->dev);
> }
> 
> /*
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index feffbda..36c9776 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -393,6 +393,7 @@ struct kvm {
>   long mmu_notifier_count;
> #endif
>   long tlbs_dirty;
> + struct list_head devices;
> };
> 
> #define kvm_err(fmt, ...) \
> @@ -1069,8 +1070,8 @@ struct kvm_device_ops;
> struct kvm_device {
>   struct kvm_device_ops *ops;
>   struct kvm *kvm;
> - atomic_t users;
>   void *private;
> + struct list_head vm_node;
> };
> 
> /* create, destroy, and name are mandatory */
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index f6cd14d..5da9f02 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -504,6 +504,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
>   mutex_init(&kvm->irq_lock);
>   mutex_init(&kvm->slots_lock);
>   atomic_set(&kvm->users_count, 1);
> + INIT_LIST_HEAD(&kvm->devices);
> 
>   r = kvm_init_mmu_notifier(kvm);
>   if (r)
> @@ -581,6 +582,19 @@ void kvm_free_physmem(struct kvm *kvm)
>   kfree(kvm->memslots);
> }
> 
> +static void kvm_destroy_devices(struct kvm *kvm)
> +{
> + struct list_head *node, *tmp;
> +
> + list_for_each_safe(node, tmp, &kvm->devices) {
> + struct kvm_device *dev =
> + list_entry(node, struct kvm_device, vm_node);
> +
> + list_del(node);
> + dev->ops->destroy(dev);
> + }
> +}
> +
> static void kvm_destroy_vm(struct kvm *kvm)
> {
>   int i;
> @@ -600,6 +614,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
>   kvm_arch_flush_shadow_all(kvm);
> #endif
>   kvm_arch_destroy_vm(kvm);
> + kvm_destroy_devices(kvm);
>   kvm_free_physmem(kvm);
>   cleanup_srcu_struct(&kvm->srcu);
>   kvm_arch_free_vm(kvm);
> @@ -2195,23 +2210,11 @@ static long kvm_device_ioctl(struct file *filp, 
> unsigned int ioctl,
>   }
> }
> 
> -void kvm_device_get(struct kvm_device *dev)
> -{
> - atomic_inc(&dev->users);
> -}
> -
> -void kvm_device_put(struct kvm_device *dev)
> -{
> - if (atomic_dec_and_test(&dev->users))
> - dev->ops->destroy(dev);
> -}
> -
> static int kvm_device_release(struct inode *inode, struct file *filp)
> {
>   struct kvm_device *dev = filp->private_data;
>   struct kvm *kvm = dev->kvm;
> 
> - kvm_device_put(dev);
>   kvm_put_kvm(kvm);
>   return 0;
> }
> @@ -2257,7 +2260,6 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
> 
>   dev->ops = ops;
>   dev->kvm = kvm;
> - atomic_set(&dev->users, 1);
> 
>   ret = ops->create(dev, cd->type);
>   if (ret < 0) {
> @@ -2271,6 +2273,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
>   return ret;
>   }
> 
> + list_add(&dev->vm_node, &kvm->devices);
>   kvm_get_kvm(kvm);
>   cd->fd = ret;
>   return 0;
> -- 
> 1.7.10.4
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html



RE: [PATCH] ppc: initialize GPRs as per epapr

2013-04-25 Thread Bhushan Bharat-R65777
This was supposed to go to qemu-devel.

Please ignore this patch.

Thanks
-Bharat

> -Original Message-
> From: Bhushan Bharat-R65777
> Sent: Friday, April 26, 2013 11:44 AM
> To: kvm-...@vger.kernel.org; kvm@vger.kernel.org; ag...@suse.de; Wood Scott-
> B07421
> Cc: Bhushan Bharat-R65777; Bhushan Bharat-R65777; Yoder Stuart-B08248
> Subject: [PATCH] ppc: initialize GPRs as per epapr
> 
> ePAPR defines the initial values of cpu registers. This patch initialize the
> GPRs as per ePAPR specification.
> 
> This resolves the issue of guest reboot/reset (guest hang on reboot).
> 
> Signed-off-by: Bharat Bhushan 
> Signed-off-by: Stuart Yoder 
> ---
>  hw/ppc/e500.c |7 +++
>  1 files changed, 7 insertions(+), 0 deletions(-)
> 
> diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c index c1bdb6b..a47f976 100644
> --- a/hw/ppc/e500.c
> +++ b/hw/ppc/e500.c
> @@ -37,6 +37,7 @@
>  #include "qemu/host-utils.h"
>  #include "hw/pci-host/ppce500.h"
> 
> +#define EPAPR_MAGIC (0x45504150)
>  #define BINARY_DEVICE_TREE_FILE "mpc8544ds.dtb"
>  #define UIMAGE_LOAD_BASE   0
>  #define DTC_LOAD_PAD   0x180
> @@ -444,6 +445,12 @@ static void ppce500_cpu_reset(void *opaque)
>  cs->halted = 0;
>  env->gpr[1] = (16<<20) - 8;
>  env->gpr[3] = bi->dt_base;
> +env->gpr[4] = 0;
> +env->gpr[5] = 0;
> +env->gpr[6] = EPAPR_MAGIC;
> +env->gpr[7] = (64 * 1024 * 1024);
> +env->gpr[8] = 0;
> +env->gpr[9] = 0;
>  env->nip = bi->entry;
>  mmubooke_create_initial_mapping(env);
>  }
> --
> 1.7.0.4




[PATCH] ppc: initialize GPRs as per epapr

2013-04-25 Thread Bharat Bhushan
ePAPR defines the initial values of cpu registers. This patch initializes
the GPRs as per the ePAPR specification.

This resolves the issue of guest reboot/reset (guest hang on reboot).
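
For context (summarized from the ePAPR boot requirements, not something defined by
this patch), the register contract being implemented below is roughly:

    /*
     * ePAPR initial register state on entry to the client program:
     *   r3 = address of the device tree blob
     *   r4 = 0
     *   r5 = 0
     *   r6 = 0x45504150 ("EPAP" ASCII magic)
     *   r7 = size of the initially mapped area (64 MiB here)
     *   r8 = 0
     *   r9 = 0
     */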

Signed-off-by: Bharat Bhushan 
Signed-off-by: Stuart Yoder 
---
 hw/ppc/e500.c |7 +++
 1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c
index c1bdb6b..a47f976 100644
--- a/hw/ppc/e500.c
+++ b/hw/ppc/e500.c
@@ -37,6 +37,7 @@
 #include "qemu/host-utils.h"
 #include "hw/pci-host/ppce500.h"
 
+#define EPAPR_MAGIC (0x45504150)
 #define BINARY_DEVICE_TREE_FILE "mpc8544ds.dtb"
 #define UIMAGE_LOAD_BASE   0
 #define DTC_LOAD_PAD   0x180
@@ -444,6 +445,12 @@ static void ppce500_cpu_reset(void *opaque)
 cs->halted = 0;
 env->gpr[1] = (16<<20) - 8;
 env->gpr[3] = bi->dt_base;
+env->gpr[4] = 0;
+env->gpr[5] = 0;
+env->gpr[6] = EPAPR_MAGIC;
+env->gpr[7] = (64 * 1024 * 1024);
+env->gpr[8] = 0;
+env->gpr[9] = 0;
 env->nip = bi->entry;
 mmubooke_create_initial_mapping(env);
 }
-- 
1.7.0.4




RE: [PATCH v10 7/7] KVM: VMX: Use posted interrupt to deliver virtual interrupt

2013-04-25 Thread Zhang, Yang Z
Yangminqiang wrote on 2013-04-26:
> Hi Yang Zhang,
> 
> Could you please let me know your CPU model or the CPU models which
> support APIC-v, which your patch requires? So that I could try your
> patches.
> 
>   Intel Software Developer's Manual, Volume 3C,
>   System Programming Guide, Part 3. Ch29,
>   APIC VIRTUALIZATION AND VIRTUAL INTERRUPTS
> Or how can I know whether my hardware supports those features listed in the
> manual above?
Ivytown or newer platforms support it.

> Thanks,
> Steven
> 
> kvm-ow...@vger.kernel.org wrote on 2013-04-11:
>> Subject: [PATCH v10 7/7] KVM: VMX: Use posted interrupt to deliver virtual
>> interrupt
>> 
>> From: Yang Zhang 
>> 
>> If posted interrupt is available, then use it to inject virtual
>> interrupts into the guest.
>> 
>> Signed-off-by: Yang Zhang 
>> ---
>>  arch/x86/kvm/lapic.c |   30 +++---
>>  arch/x86/kvm/vmx.c   |2 +-
>>  arch/x86/kvm/x86.c   |1 +
>>  3 files changed, 21 insertions(+), 12 deletions(-)
>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>> index dbf74c9..e29883c 100644
>> --- a/arch/x86/kvm/lapic.c
>> +++ b/arch/x86/kvm/lapic.c
>> @@ -353,6 +353,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic
>> *apic)
>>  if (!apic->irr_pending)
>>  return -1;
>> +kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
>>  result = apic_search_irr(apic);
>>  ASSERT(result == -1 || result >= 16);
>> @@ -683,18 +684,25 @@ static int __apic_accept_irq(struct kvm_lapic *apic,
>> int delivery_mode,
>>  if (dest_map)
>>  __set_bit(vcpu->vcpu_id, dest_map);
>> -result = !apic_test_and_set_irr(vector, apic);
>> -trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
>> -  trig_mode, vector, !result);
>> -if (!result) {
>> -if (trig_mode)
>> -apic_debug("level trig mode repeatedly for "
>> -"vector %d", vector);
>> -break;
>> -}
>> +if (kvm_x86_ops->deliver_posted_interrupt) {
>> +result = 1;
>> +kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
>> +} else {
>> +result = !apic_test_and_set_irr(vector, apic);
>> 
>> -kvm_make_request(KVM_REQ_EVENT, vcpu);
>> -kvm_vcpu_kick(vcpu);
>> +if (!result) {
>> +if (trig_mode)
>> +apic_debug("level trig mode repeatedly "
>> +"for vector %d", vector);
>> +goto out;
>> +}
>> +
>> +kvm_make_request(KVM_REQ_EVENT, vcpu);
>> +kvm_vcpu_kick(vcpu);
>> +}
>> +out:
>> +trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
>> +trig_mode, vector, !result);
>>  break;
>>  
>>  case APIC_DM_REMRD:
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 314b2ed..52b21da 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -84,7 +84,7 @@ module_param(vmm_exclusive, bool, S_IRUGO);
>>  static bool __read_mostly fasteoi = 1;
>>  module_param(fasteoi, bool, S_IRUGO);
>> -static bool __read_mostly enable_apicv;
>> +static bool __read_mostly enable_apicv = 1;
>>  module_param(enable_apicv, bool, S_IRUGO);
>>  
>>  /*
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 6147d24..628582f 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -2685,6 +2685,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>>  static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
>>  struct kvm_lapic_state *s)
>>  {
>> +kvm_x86_ops->sync_pir_to_irr(vcpu);
>>  memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
>>  
>>  return 0;
>> --
>> 1.7.1
>> 
>> --
>> To unsubscribe from this list: send the line "unsubscribe kvm" in
>> the body of a message to majord...@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Best regards,
Yang




RE: [PATCH v10 7/7] KVM: VMX: Use posted interrupt to deliver virtual interrupt

2013-04-25 Thread Yangminqiang
Hi Yang Zhang,

Could you please let me know your CPU model or the CPU models which support
APIC-v, which your patch requires? So that I could try your patches.

  Intel Software Developer's Manual, Volume 3C,
  System Programming Guide, Part 3. Ch29, 
  APIC VIRTUALIZATION AND VIRTUAL INTERRUPTS

Or how can I know whether my hardware supports those features listed in the
manual above?
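
One way to check from a Linux host (this is my assumption, not something stated in
this thread): read the VMX capability MSR IA32_VMX_PROCBASED_CTLS2 (0x48b) and look
at the allowed-1 bits in its upper half, e.g. APIC-register virtualization (bit 8)
and virtual-interrupt delivery (bit 9). A hypothetical userspace sketch using
/dev/cpu/0/msr (needs root and the msr module loaded):

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        uint64_t val;
        int fd = open("/dev/cpu/0/msr", O_RDONLY);

        /* The MSR index is the file offset for the msr device */
        if (fd < 0 || pread(fd, &val, sizeof(val), 0x48b) != sizeof(val)) {
            perror("rdmsr IA32_VMX_PROCBASED_CTLS2");
            return 1;
        }
        printf("APIC-register virtualization: %s\n", (val >> (32 + 8)) & 1 ? "yes" : "no");
        printf("virtual-interrupt delivery:   %s\n", (val >> (32 + 9)) & 1 ? "yes" : "no");
        close(fd);
        return 0;
    }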

Thanks,
Steven

> -Original Message-
> From: kvm-ow...@vger.kernel.org [mailto:kvm-ow...@vger.kernel.org] On
> Behalf Of Yang Zhang
> Sent: Thursday, April 11, 2013 7:25 PM
> To: kvm@vger.kernel.org
> Cc: g...@redhat.com; mtosa...@redhat.com; xiantao.zh...@intel.com;
> jun.nakaj...@intel.com; Yang Zhang
> Subject: [PATCH v10 7/7] KVM: VMX: Use posted interrupt to deliver virtual
> interrupt
> 
> From: Yang Zhang 
> 
> If posted interrupt is available, then use it to inject virtual
> interrupts into the guest.
> 
> Signed-off-by: Yang Zhang 
> ---
>  arch/x86/kvm/lapic.c |   30 +++---
>  arch/x86/kvm/vmx.c   |2 +-
>  arch/x86/kvm/x86.c   |1 +
>  3 files changed, 21 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index dbf74c9..e29883c 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -353,6 +353,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic
> *apic)
>   if (!apic->irr_pending)
>   return -1;
> 
> + kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
>   result = apic_search_irr(apic);
>   ASSERT(result == -1 || result >= 16);
> 
> @@ -683,18 +684,25 @@ static int __apic_accept_irq(struct kvm_lapic *apic,
> int delivery_mode,
>   if (dest_map)
>   __set_bit(vcpu->vcpu_id, dest_map);
> 
> - result = !apic_test_and_set_irr(vector, apic);
> - trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
> -   trig_mode, vector, !result);
> - if (!result) {
> - if (trig_mode)
> - apic_debug("level trig mode repeatedly for "
> - "vector %d", vector);
> - break;
> - }
> + if (kvm_x86_ops->deliver_posted_interrupt) {
> + result = 1;
> + kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
> + } else {
> + result = !apic_test_and_set_irr(vector, apic);
> 
> - kvm_make_request(KVM_REQ_EVENT, vcpu);
> - kvm_vcpu_kick(vcpu);
> + if (!result) {
> + if (trig_mode)
> + apic_debug("level trig mode repeatedly "
> + "for vector %d", vector);
> + goto out;
> + }
> +
> + kvm_make_request(KVM_REQ_EVENT, vcpu);
> + kvm_vcpu_kick(vcpu);
> + }
> +out:
> + trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
> + trig_mode, vector, !result);
>   break;
> 
>   case APIC_DM_REMRD:
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 314b2ed..52b21da 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -84,7 +84,7 @@ module_param(vmm_exclusive, bool, S_IRUGO);
>  static bool __read_mostly fasteoi = 1;
>  module_param(fasteoi, bool, S_IRUGO);
> 
> -static bool __read_mostly enable_apicv;
> +static bool __read_mostly enable_apicv = 1;
>  module_param(enable_apicv, bool, S_IRUGO);
> 
>  /*
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 6147d24..628582f 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2685,6 +2685,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>  static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
>   struct kvm_lapic_state *s)
>  {
> + kvm_x86_ops->sync_pir_to_irr(vcpu);
>   memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
> 
>   return 0;
> --
> 1.7.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 8/8 v3] KVM: PPC: e500: Add e6500 core to Kconfig description

2013-04-25 Thread tiejun.chen

On 04/25/2013 07:32 PM, Caraman Mihai Claudiu-B02008 wrote:

Is the following fine with that generic machine, ppce500, to boot P5040DS
with 64bit,

./qemu-system-ppc64 -enable-kvm -m 1048 -nographic -M ppce500 -kernel uImage
-initrd ramdisk.gz -L . -append "root=/dev/ram rw console=ttyS0,115200"
-cpu e5500 -dtb p5040ds.dtb

Thanks,

Tiejun


There is no need for -dtb.


Following your comment, I used kvm-ppc-queue, whose top commit is be28a27c,
"kvm/ppc: don't call complete_mmio_load when it's a store", plus the patch you
pointed to, to build a uImage based on corenet64_smp_defconfig. We need to
enable CONFIG_PPC_QEMU_E500 manually, and select CONFIG_TICK_CPU_ACCOUNTING,
since the default CONFIG_VIRT_CPU_ACCOUNTING_NATIVE would produce some traces
when booting the VM.


And I ran it as follows:

./qemu-system-ppc64 -enable-kvm -m 1048 -nographic -M ppce500 -kernel uImage 
-initrd ramdisk.gz  -L . -append "root=/dev/ram rw console=ttyS0,115200" -cpu e5500


But I can't see any output on the serial console.

Tiejun


Re: [PATCH 1/1] kvm:book3e: Fix a build error

2013-04-25 Thread tiejun.chen

On 04/25/2013 08:11 PM, Caraman Mihai Claudiu-B02008 wrote:

-Original Message-
From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc-
ow...@vger.kernel.org] On Behalf Of Tiejun Chen
Sent: Thursday, April 25, 2013 2:46 PM
To: ga...@kernel.crashing.org
Cc: linuxppc-...@lists.ozlabs.org; kvm-...@vger.kernel.org;
kvm@vger.kernel.org
Subject: [PATCH 1/1] kvm:book3e: Fix a build error

Commit cd66cc2e, "powerpc/85xx: Add AltiVec support for e6500", adds
support for AltiVec on a Book-E class processor, but while compiling
in the CONFIG_PPC_BOOK3E_64 and CONFIG_VIRTUALIZATION case, this
introduces the following error:

arch/powerpc/kernel/exceptions-64e.S:402: undefined reference to
`kvmppc_handler_42_0x01B'
arch/powerpc/kernel/built-in.o: In function `exc_altivec_assist_book3e':
arch/powerpc/kernel/exceptions-64e.S:424: undefined reference to
`kvmppc_handler_43_0x01B'
make: *** [vmlinux] Error 1

It looks like we should add these AltiVec KVM handlers.

Signed-off-by: Tiejun Chen 
---
  arch/powerpc/kvm/bookehv_interrupts.S |5 +
  1 file changed, 5 insertions(+)

diff --git a/arch/powerpc/kvm/bookehv_interrupts.S
b/arch/powerpc/kvm/bookehv_interrupts.S
index e8ed7d6..fa9c78a 100644
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -319,6 +319,11 @@ kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(DBG), \
SPRN_DSRR0, SPRN_DSRR1, 0
  kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(CRIT), \
SPRN_CSRR0, SPRN_CSRR1, 0
+/* altivec */
+kvm_handler BOOKE_INTERRUPT_ALTIVEC_UNAVAIL, EX_PARAMS(GEN), \
+   SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_ALTIVEC_ASSIST, EX_PARAMS(GEN), \
+   SPRN_SRR0, SPRN_SRR1, 0
  #else
  /*
   * For input register values, see
arch/powerpc/include/asm/kvm_booke_hv_asm.h
--


It seems that you are not using the kvm-ppc-queue branch.


This is just meant to fix a build error introduced in *powerpc.git* by commit
cd66cc2e, "powerpc/85xx: Add AltiVec support for e6500", as I mentioned in the
patch header.




I already have a patch ready for this (and AltiVec support is work


This change doesn't block your AltiVec support for KVM unless you think it is
wrong. In particular, we can always reproduce this error with or without
enabling AltiVec, so I also don't think it should be held back until e6500 is
supported in KVM, since KVM on e5500 should work.


Tiejun


in progress) but we first need to pull the e6500 kernel patches from the
Linux tree into agraf.git.

-Mike












[PATCH 1/2] kvm: destroy emulated devices on VM exit

2013-04-25 Thread Scott Wood
The hassle of getting refcounting right was greater than the hassle
of keeping a list of devices to destroy on VM exit.

Signed-off-by: Scott Wood 
---
 arch/powerpc/kvm/mpic.c  |2 --
 include/linux/kvm_host.h |3 ++-
 virt/kvm/kvm_main.c  |   29 -
 3 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
index d137df8..4ac98d1 100644
--- a/arch/powerpc/kvm/mpic.c
+++ b/arch/powerpc/kvm/mpic.c
@@ -1788,7 +1788,6 @@ int kvmppc_mpic_connect_vcpu(struct kvm_device *dev, 
struct kvm_vcpu *vcpu,
if (opp->mpic_mode_mask == GCR_MODE_PROXY)
vcpu->arch.epr_flags |= KVMPPC_EPR_KERNEL;
 
-   kvm_device_get(dev);
 out:
spin_unlock_irq(&opp->lock);
return ret;
@@ -1804,7 +1803,6 @@ void kvmppc_mpic_disconnect_vcpu(struct openpic *opp, 
struct kvm_vcpu *vcpu)
BUG_ON(!opp->dst[vcpu->arch.irq_cpu_id].vcpu);
 
opp->dst[vcpu->arch.irq_cpu_id].vcpu = NULL;
-   kvm_device_put(opp->dev);
 }
 
 /*
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index feffbda..36c9776 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -393,6 +393,7 @@ struct kvm {
long mmu_notifier_count;
 #endif
long tlbs_dirty;
+   struct list_head devices;
 };
 
 #define kvm_err(fmt, ...) \
@@ -1069,8 +1070,8 @@ struct kvm_device_ops;
 struct kvm_device {
struct kvm_device_ops *ops;
struct kvm *kvm;
-   atomic_t users;
void *private;
+   struct list_head vm_node;
 };
 
 /* create, destroy, and name are mandatory */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f6cd14d..5da9f02 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -504,6 +504,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
mutex_init(&kvm->irq_lock);
mutex_init(&kvm->slots_lock);
atomic_set(&kvm->users_count, 1);
+   INIT_LIST_HEAD(&kvm->devices);
 
r = kvm_init_mmu_notifier(kvm);
if (r)
@@ -581,6 +582,19 @@ void kvm_free_physmem(struct kvm *kvm)
kfree(kvm->memslots);
 }
 
+static void kvm_destroy_devices(struct kvm *kvm)
+{
+   struct list_head *node, *tmp;
+
+   list_for_each_safe(node, tmp, &kvm->devices) {
+   struct kvm_device *dev =
+   list_entry(node, struct kvm_device, vm_node);
+
+   list_del(node);
+   dev->ops->destroy(dev);
+   }
+}
+
 static void kvm_destroy_vm(struct kvm *kvm)
 {
int i;
@@ -600,6 +614,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
kvm_arch_flush_shadow_all(kvm);
 #endif
kvm_arch_destroy_vm(kvm);
+   kvm_destroy_devices(kvm);
kvm_free_physmem(kvm);
cleanup_srcu_struct(&kvm->srcu);
kvm_arch_free_vm(kvm);
@@ -2195,23 +2210,11 @@ static long kvm_device_ioctl(struct file *filp, 
unsigned int ioctl,
}
 }
 
-void kvm_device_get(struct kvm_device *dev)
-{
-   atomic_inc(&dev->users);
-}
-
-void kvm_device_put(struct kvm_device *dev)
-{
-   if (atomic_dec_and_test(&dev->users))
-   dev->ops->destroy(dev);
-}
-
 static int kvm_device_release(struct inode *inode, struct file *filp)
 {
struct kvm_device *dev = filp->private_data;
struct kvm *kvm = dev->kvm;
 
-   kvm_device_put(dev);
kvm_put_kvm(kvm);
return 0;
 }
@@ -2257,7 +2260,6 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
 
dev->ops = ops;
dev->kvm = kvm;
-   atomic_set(&dev->users, 1);
 
ret = ops->create(dev, cd->type);
if (ret < 0) {
@@ -2271,6 +2273,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
return ret;
}
 
+   list_add(&dev->vm_node, &kvm->devices);
kvm_get_kvm(kvm);
cd->fd = ret;
return 0;
-- 
1.7.10.4




[PATCH 2/2] kvm/ppc/mpic: Eliminate mmio_mapped

2013-04-25 Thread Scott Wood
We no longer need to keep track of this now that MPIC destruction
always happens either during VM destruction (after MMIO has been
destroyed) or during a failed creation (before the fd has been exposed
to userspace, and thus before the MMIO region could have been
registered).

Signed-off-by: Scott Wood 
---
 arch/powerpc/kvm/mpic.c |   29 +
 1 file changed, 1 insertion(+), 28 deletions(-)

diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
index 4ac98d1..84e828e 100644
--- a/arch/powerpc/kvm/mpic.c
+++ b/arch/powerpc/kvm/mpic.c
@@ -190,7 +190,6 @@ struct openpic {
struct kvm_io_device mmio;
struct list_head mmio_regions;
atomic_t users;
-   bool mmio_mapped;
 
gpa_t reg_base;
spinlock_t lock;
@@ -1427,24 +1426,13 @@ static int kvm_mpic_write(struct kvm_io_device *this, 
gpa_t addr,
return ret;
 }
 
-static void kvm_mpic_dtor(struct kvm_io_device *this)
-{
-   struct openpic *opp = container_of(this, struct openpic, mmio);
-
-   opp->mmio_mapped = false;
-}
-
 static const struct kvm_io_device_ops mpic_mmio_ops = {
.read = kvm_mpic_read,
.write = kvm_mpic_write,
-   .destructor = kvm_mpic_dtor,
 };
 
 static void map_mmio(struct openpic *opp)
 {
-   BUG_ON(opp->mmio_mapped);
-   opp->mmio_mapped = true;
-
kvm_iodevice_init(&opp->mmio, &mpic_mmio_ops);
 
kvm_io_bus_register_dev(opp->kvm, KVM_MMIO_BUS,
@@ -1454,10 +1442,7 @@ static void map_mmio(struct openpic *opp)
 
 static void unmap_mmio(struct openpic *opp)
 {
-   if (opp->mmio_mapped) {
-   opp->mmio_mapped = false;
-   kvm_io_bus_unregister_dev(opp->kvm, KVM_MMIO_BUS, &opp->mmio);
-   }
+   kvm_io_bus_unregister_dev(opp->kvm, KVM_MMIO_BUS, &opp->mmio);
 }
 
 static int set_base_addr(struct openpic *opp, struct kvm_device_attr *attr)
@@ -1636,18 +1621,6 @@ static void mpic_destroy(struct kvm_device *dev)
 {
struct openpic *opp = dev->private;
 
-   if (opp->mmio_mapped) {
-   /*
-* Normally we get unmapped by kvm_io_bus_destroy(),
-* which happens before the VCPUs release their references.
-*
-* Thus, we should only get here if no VCPUs took a reference
-* to us in the first place.
-*/
-   WARN_ON(opp->nb_cpus != 0);
-   unmap_mmio(opp);
-   }
-
dev->kvm->arch.mpic = NULL;
kfree(opp);
 }
-- 
1.7.10.4




[PATCH -v2] x86: Add a Kconfig shortcut for kvm guest kernel

2013-04-25 Thread Borislav Petkov
From: Borislav Petkov 
Date: Tue, 16 Apr 2013 18:24:34 +0200
Subject: [PATCH -v2] x86: Add a Kconfig shortcut for kvm guest kernel

This is pretty useful for the case where people want to boot the
resulting kernel in qemu/kvm. Instead of going and searching for each
required option through the Kconfig maze, this single option simply
enables everything that is required, or good to have, to boot the
resulting kernel in the guest.

Cc: Fengguang Wu 
Originally-by: Pekka Enberg 
Originally-by: Sasha Levin 
Signed-off-by: Borislav Petkov 
---


Here's v2 which should be addressing all review comments so far.


 arch/x86/Kconfig | 38 ++
 1 file changed, 38 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5651374d179f..76a95ffa959a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -680,6 +680,44 @@ config KVM_GUEST
  underlying device model, the host provides the guest with
  timing infrastructure such as time of day, and system time
 
+config KVM_GUEST_COMMODITY_OPTIONS
+   bool "Enable commodity options for a standalone KVM guest"
+   depends on KVM_GUEST
+   select NET
+   select NETDEVICES
+   select BLOCK
+   select BLK_DEV
+   select NETWORK_FILESYSTEMS
+   select INET
+   select EXPERIMENTAL
+   select TTY
+   select SERIAL_8250
+   select SERIAL_8250_CONSOLE
+   select IP_PNP
+   select IP_PNP_DHCP
+   select BINFMT_ELF
+   select PCI_MSI
+   select HAVE_ARCH_KGDB
+   select DEBUG_KERNEL
+   select KGDB
+   select KGDB_SERIAL_CONSOLE
+   select VIRTUALIZATION
+   select VIRTIO
+   select VIRTIO_RING
+   select VIRTIO_PCI
+   select VIRTIO_BLK
+   select VIRTIO_CONSOLE
+   select VIRTIO_NET
+   select 9P_FS
+   select NET_9P
+   select NET_9P_VIRTIO
+   ---help---
+ Select guest kernel functionality which facilitates booting the
+ kernel as a guest in qemu/kvm. This entails basic stuff like
+ serial support, kgdb and virtio, so that you get commodity
+ functionality like serial output from the guest, networking,
+ etc.
+
 source "arch/x86/lguest/Kconfig"
 
 config PARAVIRT_TIME_ACCOUNTING
-- 
1.8.2.135.g7b592fa

-- 
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.


[PATCH 1/2] vfio: Convert container->group_lock to rwsem

2013-04-25 Thread Alex Williamson
All current users are writers, maintaining current mutual exclusion.
This lets us add read users next.

Signed-off-by: Alex Williamson 
---
 drivers/vfio/vfio.c |   21 +++--
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 21eddd9..073788e 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -57,7 +58,7 @@ struct vfio_iommu_driver {
 struct vfio_container {
struct kref kref;
struct list_headgroup_list;
-   struct mutexgroup_lock;
+   struct rw_semaphore group_lock;
struct vfio_iommu_driver*iommu_driver;
void*iommu_data;
 };
@@ -738,7 +739,7 @@ static long vfio_ioctl_check_extension(struct 
vfio_container *container,
return ret;
 }
 
-/* hold container->group_lock */
+/* hold write lock on container->group_lock */
 static int __vfio_container_attach_groups(struct vfio_container *container,
  struct vfio_iommu_driver *driver,
  void *data)
@@ -769,7 +770,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container 
*container,
struct vfio_iommu_driver *driver;
long ret = -ENODEV;
 
-   mutex_lock(&container->group_lock);
+   down_write(&container->group_lock);
 
/*
 * The container is designed to be an unprivileged interface while
@@ -780,7 +781,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container 
*container,
 * the container is deprivileged and returns to an unset state.
 */
if (list_empty(&container->group_list) || container->iommu_driver) {
-   mutex_unlock(&container->group_lock);
+   up_write(&container->group_lock);
return -EINVAL;
}
 
@@ -827,7 +828,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container 
*container,
 
mutex_unlock(&vfio.iommu_drivers_lock);
 skip_drivers_unlock:
-   mutex_unlock(&container->group_lock);
+   up_write(&container->group_lock);
 
return ret;
 }
@@ -882,7 +883,7 @@ static int vfio_fops_open(struct inode *inode, struct file 
*filep)
return -ENOMEM;
 
INIT_LIST_HEAD(&container->group_list);
-   mutex_init(&container->group_lock);
+   init_rwsem(&container->group_lock);
kref_init(&container->kref);
 
filep->private_data = container;
@@ -961,7 +962,7 @@ static void __vfio_group_unset_container(struct vfio_group 
*group)
struct vfio_container *container = group->container;
struct vfio_iommu_driver *driver;
 
-   mutex_lock(&container->group_lock);
+   down_write(&container->group_lock);
 
driver = container->iommu_driver;
if (driver)
@@ -979,7 +980,7 @@ static void __vfio_group_unset_container(struct vfio_group 
*group)
container->iommu_data = NULL;
}
 
-   mutex_unlock(&container->group_lock);
+   up_write(&container->group_lock);
 
vfio_container_put(container);
 }
@@ -1039,7 +1040,7 @@ static int vfio_group_set_container(struct vfio_group 
*group, int container_fd)
container = f.file->private_data;
WARN_ON(!container); /* fget ensures we don't race vfio_release */
 
-   mutex_lock(&container->group_lock);
+   down_write(&container->group_lock);
 
driver = container->iommu_driver;
if (driver) {
@@ -1057,7 +1058,7 @@ static int vfio_group_set_container(struct vfio_group 
*group, int container_fd)
atomic_inc(&group->container_users);
 
 unlock_out:
-   mutex_unlock(&container->group_lock);
+   up_write(&container->group_lock);
fdput(f);
return ret;
 }



[PATCH 0/2] Protect against iommu driver disconnect

2013-04-25 Thread Alex Williamson
Michael Tsirkin pointed out that file operations on /dev/vfio/vfio
dereference iommu_driver and iommu_data without a lock.  If releasing
a group or unsetting the container occurs concurrently, we could race.
We currently use a mutex when setting this association, so we can
convert to a rwsem keeping the existing mutex critical sections as
down_writes and add down_reads where these are used.  Thanks,

Alex

---

Alex Williamson (2):
  vfio: Convert container->group_lock to rwsem
  vfio: Use down_reads to protect iommu disconnects


 drivers/vfio/vfio.c |   83 +++
 1 file changed, 57 insertions(+), 26 deletions(-)
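
For reference, a minimal sketch of the locking pattern described above
(made-up names, not the vfio code itself): the existing attach/detach
paths keep exclusive access by taking the semaphore for write, while the
file-op paths added in patch 2/2 only take it for read, so they no longer
race against a concurrent disconnect but still run concurrently with each
other.

#include <linux/rwsem.h>

struct demo_container {
	struct rw_semaphore lock;
	void *iommu_data;		/* may be disconnected at any time */
};

static void demo_container_init(struct demo_container *c)
{
	init_rwsem(&c->lock);
	c->iommu_data = NULL;
}

static void demo_container_set_driver(struct demo_container *c, void *data)
{
	down_write(&c->lock);		/* exclusive: changes the association */
	c->iommu_data = data;
	up_write(&c->lock);
}

static long demo_container_file_op(struct demo_container *c)
{
	long ret = -EINVAL;

	down_read(&c->lock);		/* shared: only needs a stable pointer */
	if (c->iommu_data)
		ret = 0;		/* ... call into the iommu driver here ... */
	up_read(&c->lock);

	return ret;
}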


[PATCH 2/2] vfio: Use down_reads to protect iommu disconnects

2013-04-25 Thread Alex Williamson
If a group or device is released or a container is unset from a group
it can race against file ops on the container.  Protect these with
down_reads to allow concurrent users.

Signed-off-by: Alex Williamson 
Reported-by: Michael S. Tsirkin 
---
 drivers/vfio/vfio.c |   62 ++-
 1 file changed, 46 insertions(+), 16 deletions(-)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 073788e..ac7423b 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -704,9 +704,13 @@ EXPORT_SYMBOL_GPL(vfio_del_group_dev);
 static long vfio_ioctl_check_extension(struct vfio_container *container,
   unsigned long arg)
 {
-   struct vfio_iommu_driver *driver = container->iommu_driver;
+   struct vfio_iommu_driver *driver;
long ret = 0;
 
+   down_read(&container->group_lock);
+
+   driver = container->iommu_driver;
+
switch (arg) {
/* No base extensions yet */
default:
@@ -736,6 +740,8 @@ static long vfio_ioctl_check_extension(struct 
vfio_container *container,
 VFIO_CHECK_EXTENSION, arg);
}
 
+   up_read(&container->group_lock);
+
return ret;
 }
 
@@ -844,9 +850,6 @@ static long vfio_fops_unl_ioctl(struct file *filep,
if (!container)
return ret;
 
-   driver = container->iommu_driver;
-   data = container->iommu_data;
-
switch (cmd) {
case VFIO_GET_API_VERSION:
ret = VFIO_API_VERSION;
@@ -858,8 +861,15 @@ static long vfio_fops_unl_ioctl(struct file *filep,
ret = vfio_ioctl_set_iommu(container, arg);
break;
default:
+   down_read(&container->group_lock);
+
+   driver = container->iommu_driver;
+   data = container->iommu_data;
+
if (driver) /* passthrough all unrecognized ioctls */
ret = driver->ops->ioctl(data, cmd, arg);
+
+   up_read(&container->group_lock);
}
 
return ret;
@@ -910,35 +920,55 @@ static ssize_t vfio_fops_read(struct file *filep, char 
__user *buf,
  size_t count, loff_t *ppos)
 {
struct vfio_container *container = filep->private_data;
-   struct vfio_iommu_driver *driver = container->iommu_driver;
+   struct vfio_iommu_driver *driver;
+   ssize_t ret = -EINVAL;
 
-   if (unlikely(!driver || !driver->ops->read))
-   return -EINVAL;
+   down_read(&container->group_lock);
+
+   driver = container->iommu_driver;
+   if (likely(driver && driver->ops->read))
+   ret = driver->ops->read(container->iommu_data,
+   buf, count, ppos);
 
-   return driver->ops->read(container->iommu_data, buf, count, ppos);
+   up_read(&container->group_lock);
+
+   return ret;
 }
 
 static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
   size_t count, loff_t *ppos)
 {
struct vfio_container *container = filep->private_data;
-   struct vfio_iommu_driver *driver = container->iommu_driver;
+   struct vfio_iommu_driver *driver;
+   ssize_t ret = -EINVAL;
 
-   if (unlikely(!driver || !driver->ops->write))
-   return -EINVAL;
+   down_read(&container->group_lock);
+
+   driver = container->iommu_driver;
+   if (likely(driver && driver->ops->write))
+   ret = driver->ops->write(container->iommu_data,
+buf, count, ppos);
+
+   up_read(&container->group_lock);
 
-   return driver->ops->write(container->iommu_data, buf, count, ppos);
+   return ret;
 }
 
 static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
 {
struct vfio_container *container = filep->private_data;
-   struct vfio_iommu_driver *driver = container->iommu_driver;
+   struct vfio_iommu_driver *driver;
+   int ret = -EINVAL;
 
-   if (unlikely(!driver || !driver->ops->mmap))
-   return -EINVAL;
+   down_read(&container->group_lock);
 
-   return driver->ops->mmap(container->iommu_data, vma);
+   driver = container->iommu_driver;
+   if (likely(driver && driver->ops->mmap))
+   ret = driver->ops->mmap(container->iommu_data, vma);
+
+   up_read(&container->group_lock);
+
+   return ret;
 }
 
 static const struct file_operations vfio_fops = {



[PATCH] kvm, svm: Fix typo in printk message

2013-04-25 Thread Borislav Petkov
From: Borislav Petkov 

It is "exit_int_info". It is actually EXITINTINFO in the official docs
but we don't like screaming docs.

Signed-off-by: Borislav Petkov 
---
 arch/x86/kvm/svm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a3bba7786ecc..272d29844cc5 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3491,7 +3491,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
-   printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
+   printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
   "exit_code 0x%x\n",
   __func__, svm->vmcb->control.exit_int_info,
   exit_code);
-- 
1.8.2.135.g7b592fa



Re: [PATCH 16/17] KVM: PPC: MPIC: Add support for KVM_IRQ_LINE

2013-04-25 Thread Alexander Graf

On 25.04.2013, at 21:03, Scott Wood wrote:

> On 04/25/2013 09:49:23 AM, Alexander Graf wrote:
>> On 25.04.2013, at 13:30, Alexander Graf wrote:
>> >
>> > On 19.04.2013, at 20:51, Scott Wood wrote:
>> >
>> >> On 04/19/2013 09:06:27 AM, Alexander Graf wrote:
>> >>> Now that all pieces are in place for reusing generic irq infrastructure,
>> >>> we can copy x86's implementation of KVM_IRQ_LINE irq injection and simply
>> >>> reuse it for PPC, as it will work there just as well.
>> >>> Signed-off-by: Alexander Graf 
>> >>> ---
>> >>> arch/powerpc/include/uapi/asm/kvm.h |1 +
>> >>> arch/powerpc/kvm/powerpc.c  |   13 +
>> >>> 2 files changed, 14 insertions(+), 0 deletions(-)
>> >>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
>> >>> index 3537bf3..dbb2ac2 100644
>> >>> --- a/arch/powerpc/include/uapi/asm/kvm.h
>> >>> +++ b/arch/powerpc/include/uapi/asm/kvm.h
>> >>> @@ -26,6 +26,7 @@
>> >>> #define __KVM_HAVE_SPAPR_TCE
>> >>> #define __KVM_HAVE_PPC_SMT
>> >>> #define __KVM_HAVE_IRQCHIP
>> >>> +#define __KVM_HAVE_IRQ_LINE
>> >>> struct kvm_regs {
>> >>>  __u64 pc;
>> >>> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
>> >>> index c431fea..874c106 100644
>> >>> --- a/arch/powerpc/kvm/powerpc.c
>> >>> +++ b/arch/powerpc/kvm/powerpc.c
>> >>> @@ -33,6 +33,7 @@
>> >>> #include 
>> >>> #include 
>> >>> #include "timing.h"
>> >>> +#include "irq.h"
>> >>> #include "../mm/mmu_decl.h"
>> >>> #define CREATE_TRACE_POINTS
>> >>> @@ -945,6 +946,18 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
>> >>> 	return 0;
>> >>> }
>> >>> +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
>> >>> +			  bool line_status)
>> >>> +{
>> >>> +	if (!irqchip_in_kernel(kvm))
>> >>> +		return -ENXIO;
>> >>> +
>> >>> +	irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
>> >>> +					irq_event->irq, irq_event->level,
>> >>> +					line_status);
>> >>> +	return 0;
>> >>> +}
>> >>
>> >> As Paul noted in the XICS patchset, this could reference an MPIC that has 
>> >> gone away if the user never attached any vcpus and then closed the MPIC 
>> >> fd.  It's not a reasonable use case, but it could be used maliciously to 
>> >> get the kernel to access a bad pointer.  The irqchip_in_kernel check 
>> >> helps somewhat, but it's meant for ensuring that the creation has 
>> >> happened -- it's racy if used for ensuring that destruction hasn't 
>> >> happened.
>> >>
>> >> The problem is rooted in the awkwardness of performing an operation that 
>> >> logically should be on the MPIC fd, but is instead being done on the vm 
>> >> fd.
>> >>
>> >> I think these three steps would fix it (the first two seem like things we 
>> >> should be doing anyway):
>> >> - During MPIC destruction, make sure MPIC deregisters all routes that 
>> >> reference it.
>> >> - In kvm_set_irq(), do not release the RCU read lock until after the 
>> >> set() function has been called.
>> >> - Do not hook up kvm_send_userspace_msi() to MPIC or other new irqchips, 
>> >> as that bypasses the RCU lock.  It could be supported as a device fd 
>> >> ioctl if desired, or it could be reworked to operate on an RCU-managed 
>> >> list of MSI handlers, though MPIC really doesn't need this at all.
>> >
>> > Can't we just add an RCU lock in the send_userspace_msi case? I don't 
>> > think we should handle MSIs any differently from normal IRQs.
> 
> Well, you can't *just* add the RCU lock -- you need to add data to be managed 
> via RCU (e.g. a list of MSI callbacks, or at least a boolean indicating 
> whether calling the MSI code is OK).

Well, we'd just access a random pin routing :).

> 
>> In fact I'm having a hard time verifying that we're always accessing things 
>> with proper locks held. I'm pretty sure we're missing a few cases.
> 
> Any path in particular?

I'm already getting confused on whether normal MMIO accesses are always safe.

> 
>> So how about we delay mpic destruction to vm destruction? We simply add one 
>> user too many when we spawn the mpic and put it on vm_destruct. That way 
>> users _can_ destroy mpics, but they will only be really free'd once the vm 
>> is also gone.
> 
> That's what we originally had before the fd conversion.  If we want it again, 
> we'll need to go back to maintaining a list of devices in KVM (though it 
> could be a linked list now that we don't need to use it for lookups), or have 
> some hardcoded MPIC hack.

Well, we could have an anonymous linked list of device pointers with a simple 
registration function. That way it's generic enough for any device to be kept 
alive until vm destruction if it wants that.
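
For concreteness, a rough sketch of the registration idea (invented names;
it is essentially what the "kvm: destroy emulated devices on VM exit" patch
earlier in this digest implements): devices add themselves to a per-VM list
at creation time, and the list is walked once, at VM destruction, to destroy
whatever is left.

#include <linux/list.h>

struct demo_device {
	struct list_head vm_node;			/* linkage on the per-VM list */
	void (*destroy)(struct demo_device *dev);
};

struct demo_vm {
	struct list_head devices;			/* INIT_LIST_HEAD() at VM create */
};

static void demo_register_device(struct demo_vm *vm, struct demo_device *dev)
{
	list_add(&dev->vm_node, &vm->devices);
}

static void demo_destroy_devices(struct demo_vm *vm)
{
	struct demo_device *dev, *tmp;

	/* Called from VM destruction, after the vcpus have been torn down. */
	list_for_each_entry_safe(dev, tmp, &vm->devices, vm_node) {
		list_del(&dev->vm_node);
		dev->destroy(dev);
	}
}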

> IIRC I said back then that converting to fd would make destruction ordering 
> more of a pain...

I usually like to pick the raisins from 

Re: [PATCH 16/17] KVM: PPC: MPIC: Add support for KVM_IRQ_LINE

2013-04-25 Thread Scott Wood

On 04/25/2013 09:49:23 AM, Alexander Graf wrote:


On 25.04.2013, at 13:30, Alexander Graf wrote:

>
> On 19.04.2013, at 20:51, Scott Wood wrote:
>
>> On 04/19/2013 09:06:27 AM, Alexander Graf wrote:
>>> Now that all pieces are in place for reusing generic irq infrastructure,
>>> we can copy x86's implementation of KVM_IRQ_LINE irq injection and simply
>>> reuse it for PPC, as it will work there just as well.
>>> Signed-off-by: Alexander Graf 
>>> ---
>>> arch/powerpc/include/uapi/asm/kvm.h |1 +
>>> arch/powerpc/kvm/powerpc.c  |   13 +
>>> 2 files changed, 14 insertions(+), 0 deletions(-)
>>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
>>> index 3537bf3..dbb2ac2 100644
>>> --- a/arch/powerpc/include/uapi/asm/kvm.h
>>> +++ b/arch/powerpc/include/uapi/asm/kvm.h
>>> @@ -26,6 +26,7 @@
>>> #define __KVM_HAVE_SPAPR_TCE
>>> #define __KVM_HAVE_PPC_SMT
>>> #define __KVM_HAVE_IRQCHIP
>>> +#define __KVM_HAVE_IRQ_LINE
>>> struct kvm_regs {
>>>	__u64 pc;
>>> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
>>> index c431fea..874c106 100644
>>> --- a/arch/powerpc/kvm/powerpc.c
>>> +++ b/arch/powerpc/kvm/powerpc.c
>>> @@ -33,6 +33,7 @@
>>> #include 
>>> #include 
>>> #include "timing.h"
>>> +#include "irq.h"
>>> #include "../mm/mmu_decl.h"
>>> #define CREATE_TRACE_POINTS
>>> @@ -945,6 +946,18 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
>>>	return 0;
>>> }
>>> +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
>>> +			  bool line_status)
>>> +{
>>> +	if (!irqchip_in_kernel(kvm))
>>> +		return -ENXIO;
>>> +
>>> +	irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
>>> +					irq_event->irq, irq_event->level,
>>> +					line_status);
>>> +	return 0;
>>> +}
>>
>> As Paul noted in the XICS patchset, this could reference an MPIC that
>> has gone away if the user never attached any vcpus and then closed the
>> MPIC fd.  It's not a reasonable use case, but it could be used
>> maliciously to get the kernel to access a bad pointer.  The
>> irqchip_in_kernel check helps somewhat, but it's meant for ensuring
>> that the creation has happened -- it's racy if used for ensuring that
>> destruction hasn't happened.
>>
>> The problem is rooted in the awkwardness of performing an operation
>> that logically should be on the MPIC fd, but is instead being done on
>> the vm fd.
>>
>> I think these three steps would fix it (the first two seem like things
>> we should be doing anyway):
>> - During MPIC destruction, make sure MPIC deregisters all routes that
>> reference it.
>> - In kvm_set_irq(), do not release the RCU read lock until after the
>> set() function has been called.
>> - Do not hook up kvm_send_userspace_msi() to MPIC or other new
>> irqchips, as that bypasses the RCU lock.  It could be supported as a
>> device fd ioctl if desired, or it could be reworked to operate on an
>> RCU-managed list of MSI handlers, though MPIC really doesn't need this
>> at all.

>
> Can't we just add an RCU lock in the send_userspace_msi case? I don't
> think we should handle MSIs any differently from normal IRQs.


Well, you can't *just* add the RCU lock -- you need to add data to be  
managed via RCU (e.g. a list of MSI callbacks, or at least a boolean  
indicating whether calling the MSI code is OK).


In fact I'm having a hard time verifying that we're always accessing  
things with proper locks held. I'm pretty sure we're missing a few  
cases.


Any path in particular?

So how about we delay mpic destruction to vm destruction? We simply  
add one user too many when we spawn the mpic and put it on  
vm_destruct. That way users _can_ destroy mpics, but they will only  
be really free'd once the vm is also gone.


That's what we originally had before the fd conversion.  If we want it  
again, we'll need to go back to maintaining a list of devices in KVM  
(though it could be a linked list now that we don't need to use it for  
lookups), or have some hardcoded MPIC hack.


IIRC I said back then that converting to fd would make destruction  
ordering more of a pain...


-Scott


Re: [PATCH v4 1/6] kvm: add device control API

2013-04-25 Thread Scott Wood

On 04/25/2013 01:22:04 PM, Gleb Natapov wrote:

On Thu, Apr 25, 2013 at 11:51:08AM -0500, Scott Wood wrote:
> On 04/25/2013 05:47:39 AM, Alexander Graf wrote:
> >
> >On 25.04.2013, at 11:43, Gleb Natapov wrote:
> >
> >>> +void kvm_device_put(struct kvm_device *dev)
> >>> +{
> >>> + if (atomic_dec_and_test(&dev->users))
> >>> + dev->ops->destroy(dev);
> >>> +}
> >>> +
> >>> +static int kvm_device_release(struct inode *inode, struct file
> >*filp)
> >>> +{
> >>> + struct kvm_device *dev = filp->private_data;
> >>> + struct kvm *kvm = dev->kvm;
> >>> +
> >>> + kvm_device_put(dev);
> >>> + kvm_put_kvm(kvm);
> >> We may put kvm only if users goes to zero, otherwise kvm can be
> >> freed while something holds a reference to a device. Why not make
> >> kvm_device_put() do it?
> >
> >Nice catch. I'll change the patch so it does the kvm_put_kvm
> >inside kvm_device_put's destroy branch.
>
> No, please don't.  The KVM reference being "put" here is associated
> with the file descriptor, not with the MPIC object.
Is it so? Device holds a pointer to kvm, so it increments kvm reference
to make sure the pointer is valid. What prevents kvm from being destroyed
while device is still in use in current code?


Where will that kvm pointer be used, after all the file descriptors go  
away and the vcpus stop running?  mmio_mapped guards against unmapping  
the MMIO if it's already been unmapped due to KVM destruction.  We  
don't have any timers or other delayed work.


Well, I do see one place, that Alex added -- the NULLing out of  
dev->kvm->arch.mpic, which didn't exist in my patchset.



> that change I think you'll have circular references and thus a
> memory leak, because the vcpus can hold a reference to the MPIC
> object.
>
How circular reference can be created?


MPIC holds reference on KVM, vcpu holds reference on MPIC, and vcpu is  
not destroyed until KVM is destroyed.
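
Spelling the cycle out (a sketch of the ownership graph, not code from any
of the patches): if kvm_put_kvm() were moved into kvm_device_put()'s destroy
branch, the references would look like

	struct kvm   --holds-->  vcpus        (only released at VM destruction)
	vcpu         --holds-->  kvm_device   (the MPIC reference)
	kvm_device   --holds-->  struct kvm   (only dropped in ->destroy())

The kvm refcount cannot reach zero while the device still holds its
reference, the device is only destroyed once the vcpus drop theirs, and the
vcpus only drop theirs at VM destruction -- so nothing ever gets freed.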


-Scott


Re: [PATCH v4 1/6] kvm: add device control API

2013-04-25 Thread Gleb Natapov
On Thu, Apr 25, 2013 at 11:51:08AM -0500, Scott Wood wrote:
> On 04/25/2013 05:47:39 AM, Alexander Graf wrote:
> >
> >On 25.04.2013, at 11:43, Gleb Natapov wrote:
> >
> >>> +void kvm_device_put(struct kvm_device *dev)
> >>> +{
> >>> + if (atomic_dec_and_test(&dev->users))
> >>> + dev->ops->destroy(dev);
> >>> +}
> >>> +
> >>> +static int kvm_device_release(struct inode *inode, struct file
> >*filp)
> >>> +{
> >>> + struct kvm_device *dev = filp->private_data;
> >>> + struct kvm *kvm = dev->kvm;
> >>> +
> >>> + kvm_device_put(dev);
> >>> + kvm_put_kvm(kvm);
> >> We may put kvm only if users goes to zero, otherwise kvm can be
> >> freed while something holds a reference to a device. Why not make
> >> kvm_device_put() do it?
> >
> >Nice catch. I'll change the patch so it does the kvm_put_kvm
> >inside kvm_device_put's destroy branch.
> 
> No, please don't.  The KVM reference being "put" here is associated
> with the file descriptor, not with the MPIC object.
Is it so? Device holds a pointer to kvm, so it increments kvm reference
to make sure the pointer is valid. What prevents kvm from being destroyed
while device is still in use in current code?
 

>If you make
> that change I think you'll have circular references and thus a
> memory leak, because the vcpus can hold a reference to the MPIC
> object.
> 
How circular reference can be created?

--
Gleb.


Re: [PATCH 15/17] KVM: PPC: Support irq routing and irqfd for in-kernel MPIC

2013-04-25 Thread Scott Wood

On 04/25/2013 04:58:51 AM, Alexander Graf wrote:


On 19.04.2013, at 20:02, Scott Wood wrote:

> On 04/19/2013 09:06:26 AM, Alexander Graf wrote:
>> +  if (notify_eoi != -1) {
>> +  spin_unlock_irq(&opp->lock);
>> +  kvm_notify_acked_irq(opp->kvm, 0, notify_eoi);
>> +  spin_lock_irq(&opp->lock);
>> +  }
>
> I'd rather not have the "_irq" here, which could break if we enter this
> path via an "_irqsave" (I realize there currently is no such path that
> reaches EOI emulation).
>
> Will we ever set notify_eoi when addr != EOI?  I'm wondering why it was
> moved out of the switch statement, instead of being put at the end of
> the case EOI: code.


I doubt it, but that's for the compiler to optimize away. I found it  
cleaner for some reason to put it down there. I don't think it really  
matters.


Cleanliness is my concern as well.  It doesn't seem clean to  
arbitrarily split up the EOI implementation.


-Scott


Re: [PATCH v4 1/6] kvm: add device control API

2013-04-25 Thread Scott Wood

On 04/25/2013 05:47:39 AM, Alexander Graf wrote:


On 25.04.2013, at 11:43, Gleb Natapov wrote:

>> +void kvm_device_put(struct kvm_device *dev)
>> +{
>> +  if (atomic_dec_and_test(&dev->users))
>> +  dev->ops->destroy(dev);
>> +}
>> +
>> +static int kvm_device_release(struct inode *inode, struct file  
*filp)

>> +{
>> +  struct kvm_device *dev = filp->private_data;
>> +  struct kvm *kvm = dev->kvm;
>> +
>> +  kvm_device_put(dev);
>> +  kvm_put_kvm(kvm);
> We may put kvm only if users goes to zero, otherwise kvm can be
> freed while something holds a reference to a device. Why not make
> kvm_device_put() do it?

Nice catch. I'll change the patch so it does the kvm_put_kvm inside  
kvm_device_put's destroy branch.


No, please don't.  The KVM reference being "put" here is associated  
with the file descriptor, not with the MPIC object.  If you make that  
change I think you'll have circular references and thus a memory leak,  
because the vcpus can hold a reference to the MPIC object.


-Scott


[PATCH v2] KVM: PPC: cache flush for kernel managed pages

2013-04-25 Thread Bharat Bhushan
From: Bharat Bhushan 

The kernel can only access pages which it maps as memory,
so flush only the valid kernel pages.

Signed-off-by: Bharat Bhushan 
---
v1->v2
 - move pfn_valid() check in kvmppc_mmu_flush_icache
 - Added comment to describe why this is needed

 arch/powerpc/include/asm/kvm_ppc.h |9 -
 1 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index f589307..4794de6 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -282,8 +282,15 @@ void kvmppc_init_lpid(unsigned long nr_lpids);
 
 static inline void kvmppc_mmu_flush_icache(pfn_t pfn)
 {
-   /* Clear i-cache for new pages */
struct page *page;
+   /*
+* We can only access pages that the kernel maps
+* as memory. Bail out for unmapped ones.
+*/
+   if (!pfn_valid(pfn))
+   return;
+
+   /* Clear i-cache for new pages */
page = pfn_to_page(pfn);
if (!test_bit(PG_arch_1, &page->flags)) {
flush_dcache_icache_page(page);
-- 
1.7.0.4




Re: [PATCH v3 08/32] arm64: KVM: architecture specific MMU backend

2013-04-25 Thread Christoffer Dall
On Thu, Apr 25, 2013 at 5:59 AM, Marc Zyngier  wrote:
> On 24/04/13 17:55, Christoffer Dall wrote:
>> On Wed, Apr 24, 2013 at 4:03 AM, Marc Zyngier  wrote:
>>> On 23/04/13 23:58, Christoffer Dall wrote:
 On Mon, Apr 08, 2013 at 05:17:10PM +0100, Marc Zyngier wrote:
> Define the arm64 specific MMU backend:
> - HYP/kernel VA offset
> - S2 4/64kB definitions
> - S2 page table populating and flushing
> - icache cleaning
>
> Reviewed-by: Christopher Covington 
> Signed-off-by: Marc Zyngier 
> ---
>  arch/arm64/include/asm/kvm_mmu.h | 136 
> +++
>  1 file changed, 136 insertions(+)
>  create mode 100644 arch/arm64/include/asm/kvm_mmu.h
>
> diff --git a/arch/arm64/include/asm/kvm_mmu.h 
> b/arch/arm64/include/asm/kvm_mmu.h
> new file mode 100644
> index 000..2eb2230
> --- /dev/null
> +++ b/arch/arm64/include/asm/kvm_mmu.h
> @@ -0,0 +1,136 @@
> +/*
> + * Copyright (C) 2012,2013 - ARM Ltd
> + * Author: Marc Zyngier 
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program.  If not, see .
> + */
> +
> +#ifndef __ARM64_KVM_MMU_H__
> +#define __ARM64_KVM_MMU_H__
> +
> +#include 
> +#include 
> +
> +/*
> + * As we only have the TTBR0_EL2 register, we cannot express
> + * "negative" addresses. This makes it impossible to directly share
> + * mappings with the kernel.
> + *
> + * Instead, give the HYP mode its own VA region at a fixed offset from
> + * the kernel by just masking the top bits (which are all ones for a
> + * kernel address).

 For some reason I keep choking on this, despite it being very simple.
 We're just defining a different PAGE_OFFSET, right? Why not do a hard
 define as:

 #define HYP_PAGE_OFFSET_MASK  0x
 #define HYP_PAGE_OFFSET   0xffc0

 ...or change the second paragraph of the comment to say
 that we define the HYP_PAGE_OFFSET to be 0x ffc0 .
>>>
>>> One of these days, VA_BITS will change to accommodate for more virtual
>>> space. When that day comes, I don't want to touch any of this because it
>>> did hurt enough when writing it. As such, I'll refrain from hardcoding
>>> anything.
>>>
>>> I don't mind a comment, though.
>>>
> + */
> +#define HYP_PAGE_OFFSET_SHIFT   VA_BITS
> +#define HYP_PAGE_OFFSET_MASK((UL(1) << HYP_PAGE_OFFSET_SHIFT) - 
> 1)

 In any case, is there a reason for the HYP_PAGE_OFFSET_SHIFT
 indirection? It may be simpler without...
>>>
>>> It is common practice to have XXX_SHIFT and XXX_MASK together.
>>>
> +#define HYP_PAGE_OFFSET (PAGE_OFFSET & HYP_PAGE_OFFSET_MASK)
> +
> +/*
> + * Our virtual mapping for the idmap-ed MMU-enable code. Must be
> + * shared across all the page-tables. Conveniently, we use the last
> + * possible page, where no kernel mapping will ever exist.
> + */
> +#define TRAMPOLINE_VA   (HYP_PAGE_OFFSET_MASK & PAGE_MASK)

 hmmm, ok, here it's kind of nice to have that define correlation, so
 maybe it's not cleaner.  Something should be improved here in the define
 or the comment to make it more clear.  Perhaps just adding the real
 constants in the comment or in Documentation/arm64/memory.txt would
 help.
>>>
>>> Yes, I plan to write something there.
>>>
> +
> +#ifdef __ASSEMBLY__
> +
> +/*
> + * Convert a kernel VA into a HYP VA.
> + * reg: VA to be converted.
> + */
> +.macro kern_hyp_va  reg
> +and \reg, \reg, #HYP_PAGE_OFFSET_MASK
> +.endm
> +
> +#else
> +
> +#include 
> +
> +#define KERN_TO_HYP(kva)((unsigned long)kva - PAGE_OFFSET + 
> HYP_PAGE_OFFSET)
> +
> +/*
> + * Align KVM with the kernel's view of physical memory. Should be
> + * 40bit IPA, with PGD being 8kB aligned.
> + */
> +#define KVM_PHYS_SHIFT  PHYS_MASK_SHIFT
> +#define KVM_PHYS_SIZE   (1UL << KVM_PHYS_SHIFT)
> +#define KVM_PHYS_MASK   (KVM_PHYS_SIZE - 1UL)
> +
> +#ifdef CONFIG_ARM64_64K_PAGES
> +#define PAGE_LEVELS 2
> +#define BITS_PER_LEVEL  13
> +#else  /* 4kB pages */
> +#define PAGE_LEVELS 3
> +#define BITS_PER_LEVEL  9
> +#endif

RE: [PATCH] KVM : PPC : cache flush for kernel managed pages

2013-04-25 Thread Bhushan Bharat-R65777


> -Original Message-
> From: Alexander Graf [mailto:ag...@suse.de]
> Sent: Thursday, April 25, 2013 8:36 PM
> To: Bhushan Bharat-R65777
> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Wood Scott-B07421; Bhushan
> Bharat-R65777
> Subject: Re: [PATCH] KVM : PPC : cache flush for kernel managed pages
> 
> 
> On 23.04.2013, at 08:39, Bharat Bhushan wrote:
> 
> > The kernel should only try flushing pages which are managed by the kernel.
> > pfn_to_page() returns a junk struct page for pages not managed by the
> > kernel, so if the kernel tries to flush direct mapped memory or a direct
> > assigned device mapping then it will operate on a junk struct page.
> >
> > Signed-off-by: Bharat Bhushan 
> > ---
> > arch/powerpc/kvm/e500_mmu_host.c |3 ++-
> > 1 files changed, 2 insertions(+), 1 deletions(-)
> >
> > diff --git a/arch/powerpc/kvm/e500_mmu_host.c
> > b/arch/powerpc/kvm/e500_mmu_host.c
> > index 1c6a9d7..e07da21 100644
> > --- a/arch/powerpc/kvm/e500_mmu_host.c
> > +++ b/arch/powerpc/kvm/e500_mmu_host.c
> > @@ -455,7 +455,8 @@ static inline int kvmppc_e500_shadow_map(struct
> kvmppc_vcpu_e500 *vcpu_e500,
> > ref, gvaddr, stlbe);
> >
> > /* Clear i-cache for new pages */
> > -   kvmppc_mmu_flush_icache(pfn);
> > +   if (pfn_valid(pfn))
> > +   kvmppc_mmu_flush_icache(pfn);
> 
> Could you please move the check into kvmppc_mmu_flush_icache()? That way we're
> guaranteed we can't screw up cache flushes ever :).
> 
> Also, please add a comment saying why we need this.

Ok

-Bharat

> 
> 
> Alex
> 
> >
> > /* Drop refcount on page, so that mmu notifiers can clear it */
> > kvm_release_pfn_clean(pfn);
> > --
> > 1.7.0.4
> >
> >
> 




Re: [PATCH] KVM : PPC : cache flush for kernel managed pages

2013-04-25 Thread Alexander Graf

On 23.04.2013, at 08:39, Bharat Bhushan wrote:

> The kernel should only try flushing pages which are managed by the kernel.
> pfn_to_page() returns a junk struct page for pages not managed by the kernel,
> so if the kernel tries to flush direct mapped memory or a direct assigned
> device mapping then it will operate on a junk struct page.
> 
> Signed-off-by: Bharat Bhushan 
> ---
> arch/powerpc/kvm/e500_mmu_host.c |3 ++-
> 1 files changed, 2 insertions(+), 1 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/e500_mmu_host.c 
> b/arch/powerpc/kvm/e500_mmu_host.c
> index 1c6a9d7..e07da21 100644
> --- a/arch/powerpc/kvm/e500_mmu_host.c
> +++ b/arch/powerpc/kvm/e500_mmu_host.c
> @@ -455,7 +455,8 @@ static inline int kvmppc_e500_shadow_map(struct 
> kvmppc_vcpu_e500 *vcpu_e500,
>   ref, gvaddr, stlbe);
> 
>   /* Clear i-cache for new pages */
> - kvmppc_mmu_flush_icache(pfn);
> + if (pfn_valid(pfn))
> + kvmppc_mmu_flush_icache(pfn);

Could you please move the check into kvmppc_mmu_flush_icache()? That way we're 
guaranteed we can't screw up cache flushes ever :).

Also, please add a comment saying why we need this.


Alex

> 
>   /* Drop refcount on page, so that mmu notifiers can clear it */
>   kvm_release_pfn_clean(pfn);
> -- 
> 1.7.0.4
> 
> 



Re: [PATCH 16/17] KVM: PPC: MPIC: Add support for KVM_IRQ_LINE

2013-04-25 Thread Alexander Graf

On 25.04.2013, at 13:30, Alexander Graf wrote:

> 
> On 19.04.2013, at 20:51, Scott Wood wrote:
> 
>> On 04/19/2013 09:06:27 AM, Alexander Graf wrote:
>>> Now that all pieces are in place for reusing generic irq infrastructure,
>>> we can copy x86's implementation of KVM_IRQ_LINE irq injection and simply
>>> reuse it for PPC, as it will work there just as well.
>>> Signed-off-by: Alexander Graf 
>>> ---
>>> arch/powerpc/include/uapi/asm/kvm.h |1 +
>>> arch/powerpc/kvm/powerpc.c  |   13 +
>>> 2 files changed, 14 insertions(+), 0 deletions(-)
>>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
>>> b/arch/powerpc/include/uapi/asm/kvm.h
>>> index 3537bf3..dbb2ac2 100644
>>> --- a/arch/powerpc/include/uapi/asm/kvm.h
>>> +++ b/arch/powerpc/include/uapi/asm/kvm.h
>>> @@ -26,6 +26,7 @@
>>> #define __KVM_HAVE_SPAPR_TCE
>>> #define __KVM_HAVE_PPC_SMT
>>> #define __KVM_HAVE_IRQCHIP
>>> +#define __KVM_HAVE_IRQ_LINE
>>> struct kvm_regs {
>>> __u64 pc;
>>> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
>>> index c431fea..874c106 100644
>>> --- a/arch/powerpc/kvm/powerpc.c
>>> +++ b/arch/powerpc/kvm/powerpc.c
>>> @@ -33,6 +33,7 @@
>>> #include 
>>> #include 
>>> #include "timing.h"
>>> +#include "irq.h"
>>> #include "../mm/mmu_decl.h"
>>> #define CREATE_TRACE_POINTS
>>> @@ -945,6 +946,18 @@ static int kvm_vm_ioctl_get_pvinfo(struct 
>>> kvm_ppc_pvinfo *pvinfo)
>>> return 0;
>>> }
>>> +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
>>> + bool line_status)
>>> +{
>>> +   if (!irqchip_in_kernel(kvm))
>>> +   return -ENXIO;
>>> +
>>> +   irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
>>> +   irq_event->irq, irq_event->level,
>>> +   line_status);
>>> +   return 0;
>>> +}
>> 
>> As Paul noted in the XICS patchset, this could reference an MPIC that has 
>> gone away if the user never attached any vcpus and then closed the MPIC fd.  
>> It's not a reasonable use case, but it could be used maliciously to get the 
>> kernel to access a bad pointer.  The irqchip_in_kernel check helps somewhat, 
>> but it's meant for ensuring that the creation has happened -- it's racy if 
>> used for ensuring that destruction hasn't happened.
>> 
>> The problem is rooted in the awkwardness of performing an operation that 
>> logically should be on the MPIC fd, but is instead being done on the vm fd.
>> 
>> I think these three steps would fix it (the first two seem like things we 
>> should be doing anyway):
>> - During MPIC destruction, make sure MPIC deregisters all routes that 
>> reference it.
>> - In kvm_set_irq(), do not release the RCU read lock until after the set() 
>> function has been called.
>> - Do not hook up kvm_send_userspace_msi() to MPIC or other new irqchips, as 
>> that bypasses the RCU lock.  It could be supported as a device fd ioctl if 
>> desired, or it could be reworked to operate on an RCU-managed list of MSI 
>> handlers, though MPIC really doesn't need this at all.
> 
> Can't we just add an RCU lock in the send_userspace_msi case? I don't think 
> we should handle MSIs any differently from normal IRQs.

In fact I'm having a hard time verifying that we're always accessing things 
with proper locks held. I'm pretty sure we're missing a few cases.

So how about we delay mpic destruction to vm destruction? We simply add one 
user too many when we spawn the mpic and put it on vm_destruct. That way users 
_can_ destroy mpics, but they will only be really free'd once the vm is also 
gone.


Alex



Re: [PATCH v4 1/6] kvm: add device control API

2013-04-25 Thread Gleb Natapov
On Thu, Apr 25, 2013 at 03:45:14PM +0200, Alexander Graf wrote:
> >>> Please move struct definitions and KVM_CREATE_DEVICE_TEST define out
> >>> from ioctl definition block.
> >> 
> >> Let me change that in my tree...
> >> 
> > So are you sending this via your tree and I should not apply it directly?
> 
> I was hoping to have things ready very soon for you to just pull...
> 
Make sense since there are PPC patches that depend on this one. 3.10 merge 
windows
will very likely open next week though...

--
Gleb.


Re: [PATCH v4 1/6] kvm: add device control API

2013-04-25 Thread Alexander Graf

On 25.04.2013, at 14:07, Gleb Natapov wrote:

> On Thu, Apr 25, 2013 at 12:47:39PM +0200, Alexander Graf wrote:
>> 
>> On 25.04.2013, at 11:43, Gleb Natapov wrote:
>> 
>>> On Fri, Apr 12, 2013 at 07:08:42PM -0500, Scott Wood wrote:
 Currently, devices that are emulated inside KVM are configured in a
 hardcoded manner based on an assumption that any given architecture
 only has one way to do it.  If there's any need to access device state,
 it is done through inflexible one-purpose-only IOCTLs (e.g.
 KVM_GET/SET_LAPIC).  Defining new IOCTLs for every little thing is
 cumbersome and depletes a limited numberspace.
 
 This API provides a mechanism to instantiate a device of a certain
 type, returning an ID that can be used to set/get attributes of the
 device.  Attributes may include configuration parameters (e.g.
 register base address), device state, operational commands, etc.  It
 is similar to the ONE_REG API, except that it acts on devices rather
 than vcpus.
 
 Both device types and individual attributes can be tested without having
 to create the device or get/set the attribute, without the need for
 separately managing enumerated capabilities.
 
 Signed-off-by: Scott Wood 
 ---
 v4:
 - Move some boilerplate back into generic code, as requested by Gleb.
  File descriptor management and reference counting is no longer the
  concern of the device implementation.
 
 - Don't hold kvm->lock during create.  The original reasons
  for doing so have vanished as for as MPIC is concerned, and
  this avoids needing to answer the question of whether to
  hold the lock during destroy as well.
 
  Paul, you may need to acquire the lock yourself in kvm_create_xics()
  to protect the -EEXIST check.
 
 v3: remove some changes that were merged into this patch by accident,
 and fix the error documentation for KVM_CREATE_DEVICE.
 ---
 Documentation/virtual/kvm/api.txt|   70 
 Documentation/virtual/kvm/devices/README |1 +
 include/linux/kvm_host.h |   35 
 include/uapi/linux/kvm.h |   27 +++
 virt/kvm/kvm_main.c  |  129 
 ++
 5 files changed, 262 insertions(+)
 create mode 100644 Documentation/virtual/kvm/devices/README
 
 diff --git a/Documentation/virtual/kvm/api.txt 
 b/Documentation/virtual/kvm/api.txt
 index 976eb65..d52f3f9 100644
 --- a/Documentation/virtual/kvm/api.txt
 +++ b/Documentation/virtual/kvm/api.txt
 @@ -2173,6 +2173,76 @@ header; first `n_valid' valid entries with contents 
 from the data
 written, then `n_invalid' invalid entries, invalidating any previously
 valid entries found.
 
 +4.79 KVM_CREATE_DEVICE
 +
 +Capability: KVM_CAP_DEVICE_CTRL
 +Type: vm ioctl
 +Parameters: struct kvm_create_device (in/out)
 +Returns: 0 on success, -1 on error
 +Errors:
 +  ENODEV: The device type is unknown or unsupported
 +  EEXIST: Device already created, and this type of device may not
 +  be instantiated multiple times
 +
 +  Other error conditions may be defined by individual device types or
 +  have their standard meanings.
 +
 +Creates an emulated device in the kernel.  The file descriptor returned
 +in fd can be used with KVM_SET/GET/HAS_DEVICE_ATTR.
 +
 +If the KVM_CREATE_DEVICE_TEST flag is set, only test whether the
 +device type is supported (not necessarily whether it can be created
 +in the current vm).
 +
 +Individual devices should not define flags.  Attributes should be used
 +for specifying any behavior that is not implied by the device type
 +number.
 +
 +struct kvm_create_device {
 +  __u32   type;   /* in: KVM_DEV_TYPE_xxx */
 +  __u32   fd; /* out: device handle */
 +  __u32   flags;  /* in: KVM_CREATE_DEVICE_xxx */
 +};
>>> Should we add __u32 padding here to make struct size multiple of u64?
>> 
>> Do you know of any arch that pads structs to u64 boundaries? x86_64 doesn't 
>> and ppc64 doesn't either.
>> 
> Not really. I just notices that we pad some structures to that effect.

I don't think we really need to :).

> 
>>> 
 +
 +4.80 KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR
 +
 +Capability: KVM_CAP_DEVICE_CTRL
 +Type: device ioctl
 +Parameters: struct kvm_device_attr
 +Returns: 0 on success, -1 on error
 +Errors:
 +  ENXIO:  The group or attribute is unknown/unsupported for this device
 +  EPERM:  The attribute cannot (currently) be accessed this way
 +  (e.g. read-only attribute, or attribute that only makes
 +  sense when the device is in a different state)
 +
 +  Other error conditions may be defined by individual device types.
 +
 +Gets/sets a specifie
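
As a hedged illustration of how userspace would drive the API quoted above
(a sketch, not code from the patch; the device type value passed in and the
error handling are assumptions), creation plus the TEST probe look roughly
like this:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>	/* assumes a uapi header carrying the new ioctl */

/* Returns the device fd on success, -1 on failure. */
static int create_in_kernel_device(int vm_fd, __u32 type)
{
	struct kvm_create_device cd;

	memset(&cd, 0, sizeof(cd));
	cd.type = type;

	/* Probe only: does this kernel know the device type at all? */
	cd.flags = KVM_CREATE_DEVICE_TEST;
	if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0)
		return -1;			/* e.g. ENODEV */

	/* Really create it; the kernel fills in cd.fd. */
	cd.flags = 0;
	if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0)
		return -1;			/* e.g. EEXIST for singletons */

	return cd.fd;	/* handle for KVM_SET/GET/HAS_DEVICE_ATTR */
}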

Re: [PATCH v3 08/32] arm64: KVM: architecture specific MMU backend

2013-04-25 Thread Marc Zyngier
On 24/04/13 17:55, Christoffer Dall wrote:
> On Wed, Apr 24, 2013 at 4:03 AM, Marc Zyngier  wrote:
>> On 23/04/13 23:58, Christoffer Dall wrote:
>>> On Mon, Apr 08, 2013 at 05:17:10PM +0100, Marc Zyngier wrote:
 Define the arm64 specific MMU backend:
 - HYP/kernel VA offset
 - S2 4/64kB definitions
 - S2 page table populating and flushing
 - icache cleaning

 Reviewed-by: Christopher Covington 
 Signed-off-by: Marc Zyngier 
 ---
  arch/arm64/include/asm/kvm_mmu.h | 136 
 +++
  1 file changed, 136 insertions(+)
  create mode 100644 arch/arm64/include/asm/kvm_mmu.h

 diff --git a/arch/arm64/include/asm/kvm_mmu.h 
 b/arch/arm64/include/asm/kvm_mmu.h
 new file mode 100644
 index 000..2eb2230
 --- /dev/null
 +++ b/arch/arm64/include/asm/kvm_mmu.h
 @@ -0,0 +1,136 @@
 +/*
 + * Copyright (C) 2012,2013 - ARM Ltd
 + * Author: Marc Zyngier 
 + *
 + * This program is free software; you can redistribute it and/or modify
 + * it under the terms of the GNU General Public License version 2 as
 + * published by the Free Software Foundation.
 + *
 + * This program is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 + * GNU General Public License for more details.
 + *
 + * You should have received a copy of the GNU General Public License
 + * along with this program.  If not, see .
 + */
 +
 +#ifndef __ARM64_KVM_MMU_H__
 +#define __ARM64_KVM_MMU_H__
 +
 +#include 
 +#include 
 +
 +/*
 + * As we only have the TTBR0_EL2 register, we cannot express
 + * "negative" addresses. This makes it impossible to directly share
 + * mappings with the kernel.
 + *
 + * Instead, give the HYP mode its own VA region at a fixed offset from
 + * the kernel by just masking the top bits (which are all ones for a
 + * kernel address).
>>>
>>> For some reason I keep choking on this, despite it being very simple.
>>> We're just defining a different PAGE_OFFSET, right? Why not do a hard
>>> define as:
>>>
>>> #define HYP_PAGE_OFFSET_MASK  0x
>>> #define HYP_PAGE_OFFSET   0xffc0
>>>
>>> ...or change the second paragraph of the comment to say
>>> that we define the HYP_PAGE_OFFSET to be 0x ffc0 .
>>
>> One of these days, VA_BITS will change to accommodate for more virtual
>> space. When that day comes, I don't want to touch any of this because it
>> did hurt enough when writing it. As such, I'll refrain from hardcoding
>> anything.
>>
>> I don't mind a comment, though.
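
For concreteness, a worked example of the macros being discussed, assuming
VA_BITS == 39 and the usual arm64 PAGE_OFFSET of 0xffffffc000000000 for that
configuration (both values are assumptions here, not stated in the thread):

#include <stdio.h>

#define VA_BITS			39
#define PAGE_OFFSET		0xffffffc000000000UL	/* assumed 39-bit VA layout */
#define HYP_PAGE_OFFSET_SHIFT	VA_BITS
#define HYP_PAGE_OFFSET_MASK	((1UL << HYP_PAGE_OFFSET_SHIFT) - 1)
#define HYP_PAGE_OFFSET		(PAGE_OFFSET & HYP_PAGE_OFFSET_MASK)
#define KERN_TO_HYP(kva)	((unsigned long)(kva) - PAGE_OFFSET + HYP_PAGE_OFFSET)

int main(void)
{
	unsigned long kva = 0xffffffc000080000UL;	/* some linear-map kernel VA */

	printf("mask    = %#lx\n", HYP_PAGE_OFFSET_MASK);	/* 0x7fffffffff */
	printf("hyp off = %#lx\n", HYP_PAGE_OFFSET);		/* 0x4000000000 */
	printf("hyp va  = %#lx\n", KERN_TO_HYP(kva));		/* 0x4000080000 */
	return 0;
}

Because a kernel VA has all of its top bits set, KERN_TO_HYP() here gives
the same result as simply masking with HYP_PAGE_OFFSET_MASK, which is the
point the comment in the patch is making.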
>>
 + */
 +#define HYP_PAGE_OFFSET_SHIFT   VA_BITS
 +#define HYP_PAGE_OFFSET_MASK((UL(1) << HYP_PAGE_OFFSET_SHIFT) - 1)
>>>
>>> In any case, is there a reason for the HYP_PAGE_OFFSET_SHIFT
>>> indirection? It may be simpler without...
>>
>> It is common practice to have XXX_SHIFT and XXX_MASK together.
>>
 +#define HYP_PAGE_OFFSET (PAGE_OFFSET & HYP_PAGE_OFFSET_MASK)
 +
 +/*
 + * Our virtual mapping for the idmap-ed MMU-enable code. Must be
 + * shared across all the page-tables. Conveniently, we use the last
 + * possible page, where no kernel mapping will ever exist.
 + */
 +#define TRAMPOLINE_VA   (HYP_PAGE_OFFSET_MASK & PAGE_MASK)
>>>
>>> hmmm, ok, here it's kind of nice to have that define correlation, so
>>> maybe it's not cleaner.  Something should be improved here in the define
>>> or the comment to make it more clear.  Perhaps just adding the real
>>> constants in the comment or in Documentation/arm64/memory.txt would
>>> help.
>>
>> Yes, I plan to write something there.
>>
 +
 +#ifdef __ASSEMBLY__
 +
 +/*
 + * Convert a kernel VA into a HYP VA.
 + * reg: VA to be converted.
 + */
 +.macro kern_hyp_va  reg
 +and \reg, \reg, #HYP_PAGE_OFFSET_MASK
 +.endm
 +
 +#else
 +
 +#include 
 +
 +#define KERN_TO_HYP(kva)((unsigned long)kva - PAGE_OFFSET + 
 HYP_PAGE_OFFSET)
 +
 +/*
 + * Align KVM with the kernel's view of physical memory. Should be
 + * 40bit IPA, with PGD being 8kB aligned.
 + */
 +#define KVM_PHYS_SHIFT  PHYS_MASK_SHIFT
 +#define KVM_PHYS_SIZE   (1UL << KVM_PHYS_SHIFT)
 +#define KVM_PHYS_MASK   (KVM_PHYS_SIZE - 1UL)
 +
 +#ifdef CONFIG_ARM64_64K_PAGES
 +#define PAGE_LEVELS 2
 +#define BITS_PER_LEVEL  13
 +#else  /* 4kB pages */
 +#define PAGE_LEVELS 3
 +#define BITS_PER_LEVEL  9
 +#endif
>>>
>>> What are the semantics of these defines exactly? They should be
>>> S2_PAGE_LEVELS and make some assumptions of the VTCR_EL2.SL0 field
>>> right?
>>
>> Indeed, we assume SL0 is always 1,

RE: [PATCH 1/1] kvm:book3e: Fix a build error

2013-04-25 Thread Caraman Mihai Claudiu-B02008
> -Original Message-
> From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc-
> ow...@vger.kernel.org] On Behalf Of Tiejun Chen
> Sent: Thursday, April 25, 2013 2:46 PM
> To: ga...@kernel.crashing.org
> Cc: linuxppc-...@lists.ozlabs.org; kvm-...@vger.kernel.org;
> kvm@vger.kernel.org
> Subject: [PATCH 1/1] kvm:book3e: Fix a build error
> 
> Commit cd66cc2e, "powerpc/85xx: Add AltiVec support for e6500", adds
> support for AltiVec on a Book-E class processor, but while compiling
> in the CONFIG_PPC_BOOK3E_64 and CONFIG_VIRTUALIZATION case, this
> introduces the following error:
> 
> arch/powerpc/kernel/exceptions-64e.S:402: undefined reference to
> `kvmppc_handler_42_0x01B'
> arch/powerpc/kernel/built-in.o: In function `exc_altivec_assist_book3e':
> arch/powerpc/kernel/exceptions-64e.S:424: undefined reference to
> `kvmppc_handler_43_0x01B'
> make: *** [vmlinux] Error 1
> 
> Looks like we should add these AltiVec KVM handlers.
> 
> Signed-off-by: Tiejun Chen 
> ---
>  arch/powerpc/kvm/bookehv_interrupts.S |5 +
>  1 file changed, 5 insertions(+)
> 
> diff --git a/arch/powerpc/kvm/bookehv_interrupts.S
> b/arch/powerpc/kvm/bookehv_interrupts.S
> index e8ed7d6..fa9c78a 100644
> --- a/arch/powerpc/kvm/bookehv_interrupts.S
> +++ b/arch/powerpc/kvm/bookehv_interrupts.S
> @@ -319,6 +319,11 @@ kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(DBG), \
>   SPRN_DSRR0, SPRN_DSRR1, 0
>  kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(CRIT), \
>   SPRN_CSRR0, SPRN_CSRR1, 0
> +/* altivec */
> +kvm_handler BOOKE_INTERRUPT_ALTIVEC_UNAVAIL, EX_PARAMS(GEN), \
> + SPRN_SRR0, SPRN_SRR1, 0
> +kvm_handler BOOKE_INTERRUPT_ALTIVEC_ASSIST, EX_PARAMS(GEN), \
> + SPRN_SRR0, SPRN_SRR1, 0
>  #else
>  /*
>   * For input register values, see
> arch/powerpc/include/asm/kvm_booke_hv_asm.h
> --
 
It seems that you are not using the kvm-ppc-queue branch.

I already have a patch ready for this (and AltiVec support is work
in progress), but we first need to pull the e6500 kernel patches from
the Linux tree into agraf.git.
 
-Mike



Re: [PATCH v4 1/6] kvm: add device control API

2013-04-25 Thread Gleb Natapov
On Thu, Apr 25, 2013 at 12:47:39PM +0200, Alexander Graf wrote:
> 
> On 25.04.2013, at 11:43, Gleb Natapov wrote:
> 
> > On Fri, Apr 12, 2013 at 07:08:42PM -0500, Scott Wood wrote:
> >> Currently, devices that are emulated inside KVM are configured in a
> >> hardcoded manner based on an assumption that any given architecture
> >> only has one way to do it.  If there's any need to access device state,
> >> it is done through inflexible one-purpose-only IOCTLs (e.g.
> >> KVM_GET/SET_LAPIC).  Defining new IOCTLs for every little thing is
> >> cumbersome and depletes a limited numberspace.
> >> 
> >> This API provides a mechanism to instantiate a device of a certain
> >> type, returning an ID that can be used to set/get attributes of the
> >> device.  Attributes may include configuration parameters (e.g.
> >> register base address), device state, operational commands, etc.  It
> >> is similar to the ONE_REG API, except that it acts on devices rather
> >> than vcpus.
> >> 
> >> Both device types and individual attributes can be tested without having
> >> to create the device or get/set the attribute, without the need for
> >> separately managing enumerated capabilities.
> >> 
> >> Signed-off-by: Scott Wood 
> >> ---
> >> v4:
> >> - Move some boilerplate back into generic code, as requested by Gleb.
> >>   File descriptor management and reference counting is no longer the
> >>   concern of the device implementation.
> >> 
> >> - Don't hold kvm->lock during create.  The original reasons
> >>   for doing so have vanished as far as MPIC is concerned, and
> >>   this avoids needing to answer the question of whether to
> >>   hold the lock during destroy as well.
> >> 
> >>   Paul, you may need to acquire the lock yourself in kvm_create_xics()
> >>   to protect the -EEXIST check.
> >> 
> >> v3: remove some changes that were merged into this patch by accident,
> >> and fix the error documentation for KVM_CREATE_DEVICE.
> >> ---
> >> Documentation/virtual/kvm/api.txt|   70 
> >> Documentation/virtual/kvm/devices/README |1 +
> >> include/linux/kvm_host.h |   35 
> >> include/uapi/linux/kvm.h |   27 +++
> >> virt/kvm/kvm_main.c  |  129 
> >> ++
> >> 5 files changed, 262 insertions(+)
> >> create mode 100644 Documentation/virtual/kvm/devices/README
> >> 
> >> diff --git a/Documentation/virtual/kvm/api.txt 
> >> b/Documentation/virtual/kvm/api.txt
> >> index 976eb65..d52f3f9 100644
> >> --- a/Documentation/virtual/kvm/api.txt
> >> +++ b/Documentation/virtual/kvm/api.txt
> >> @@ -2173,6 +2173,76 @@ header; first `n_valid' valid entries with contents 
> >> from the data
> >> written, then `n_invalid' invalid entries, invalidating any previously
> >> valid entries found.
> >> 
> >> +4.79 KVM_CREATE_DEVICE
> >> +
> >> +Capability: KVM_CAP_DEVICE_CTRL
> >> +Type: vm ioctl
> >> +Parameters: struct kvm_create_device (in/out)
> >> +Returns: 0 on success, -1 on error
> >> +Errors:
> >> +  ENODEV: The device type is unknown or unsupported
> >> +  EEXIST: Device already created, and this type of device may not
> >> +  be instantiated multiple times
> >> +
> >> +  Other error conditions may be defined by individual device types or
> >> +  have their standard meanings.
> >> +
> >> +Creates an emulated device in the kernel.  The file descriptor returned
> >> +in fd can be used with KVM_SET/GET/HAS_DEVICE_ATTR.
> >> +
> >> +If the KVM_CREATE_DEVICE_TEST flag is set, only test whether the
> >> +device type is supported (not necessarily whether it can be created
> >> +in the current vm).
> >> +
> >> +Individual devices should not define flags.  Attributes should be used
> >> +for specifying any behavior that is not implied by the device type
> >> +number.
> >> +
> >> +struct kvm_create_device {
> >> +  __u32   type;   /* in: KVM_DEV_TYPE_xxx */
> >> +  __u32   fd; /* out: device handle */
> >> +  __u32   flags;  /* in: KVM_CREATE_DEVICE_xxx */
> >> +};
> > Should we add __u32 padding here to make struct size multiple of u64?
> 
> Do you know of any arch that pads structs to u64 boundaries? x86_64 doesn't 
> and ppc64 doesn't either.
> 
Not really. I just noticed that we pad some structures to that effect.
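
For reference, a quick compile-and-run sketch of the layout question (the
padded variant and its "pad" member are purely hypothetical, not part of the
proposed API):

#include <assert.h>
#include <stdint.h>

/* As proposed: three 32-bit fields, 12 bytes. */
struct kvm_create_device_sketch {
        uint32_t type;
        uint32_t fd;
        uint32_t flags;
};

/* Hypothetical variant that rounds the size up to a multiple of u64. */
struct kvm_create_device_padded_sketch {
        uint32_t type;
        uint32_t fd;
        uint32_t flags;
        uint32_t pad;
};

int main(void)
{
        /* On x86_64 and ppc64 the unpadded struct stays at 12 bytes;
         * neither ABI adds tail padding for u64 alignment, which is the
         * point made in the quoted reply above. */
        assert(sizeof(struct kvm_create_device_sketch) == 12);
        assert(sizeof(struct kvm_create_device_padded_sketch) == 16);
        return 0;
}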

> > 
> >> +
> >> +4.80 KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR
> >> +
> >> +Capability: KVM_CAP_DEVICE_CTRL
> >> +Type: device ioctl
> >> +Parameters: struct kvm_device_attr
> >> +Returns: 0 on success, -1 on error
> >> +Errors:
> >> +  ENXIO:  The group or attribute is unknown/unsupported for this device
> >> +  EPERM:  The attribute cannot (currently) be accessed this way
> >> +  (e.g. read-only attribute, or attribute that only makes
> >> +  sense when the device is in a different state)
> >> +
> >> +  Other error conditions may be defined by individual device types.
> >> +
> >> +Gets/sets a specified piece of device configuration and/or state.  The
> >> +semantics are device-specific.  Se

[PATCH 1/1] kvm:book3e: Fix a build error

2013-04-25 Thread Tiejun Chen
Commit cd66cc2e, "powerpc/85xx: Add AltiVec support for e6500", adds
support for AltiVec on a Book-E class processor, but when compiling
with CONFIG_PPC_BOOK3E_64 and CONFIG_VIRTUALIZATION enabled it
introduces the following error:

arch/powerpc/kernel/exceptions-64e.S:402: undefined reference to 
`kvmppc_handler_42_0x01B'
arch/powerpc/kernel/built-in.o: In function `exc_altivec_assist_book3e':
arch/powerpc/kernel/exceptions-64e.S:424: undefined reference to 
`kvmppc_handler_43_0x01B'
make: *** [vmlinux] Error 1

It looks like we should add these AltiVec KVM handlers.

Signed-off-by: Tiejun Chen 
---
 arch/powerpc/kvm/bookehv_interrupts.S |5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/powerpc/kvm/bookehv_interrupts.S 
b/arch/powerpc/kvm/bookehv_interrupts.S
index e8ed7d6..fa9c78a 100644
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -319,6 +319,11 @@ kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(DBG), \
SPRN_DSRR0, SPRN_DSRR1, 0
 kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(CRIT), \
SPRN_CSRR0, SPRN_CSRR1, 0
+/* altivec */
+kvm_handler BOOKE_INTERRUPT_ALTIVEC_UNAVAIL, EX_PARAMS(GEN), \
+   SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_ALTIVEC_ASSIST, EX_PARAMS(GEN), \
+   SPRN_SRR0, SPRN_SRR1, 0
 #else
 /*
  * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h
-- 
1.7.9.5



Re: [PATCH 16/17] KVM: PPC: MPIC: Add support for KVM_IRQ_LINE

2013-04-25 Thread Alexander Graf

On 19.04.2013, at 20:51, Scott Wood wrote:

> On 04/19/2013 09:06:27 AM, Alexander Graf wrote:
>> Now that all pieces are in place for reusing generic irq infrastructure,
>> we can copy x86's implementation of KVM_IRQ_LINE irq injection and simply
>> reuse it for PPC, as it will work there just as well.
>> Signed-off-by: Alexander Graf 
>> ---
>> arch/powerpc/include/uapi/asm/kvm.h |1 +
>> arch/powerpc/kvm/powerpc.c  |   13 +
>> 2 files changed, 14 insertions(+), 0 deletions(-)
>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
>> b/arch/powerpc/include/uapi/asm/kvm.h
>> index 3537bf3..dbb2ac2 100644
>> --- a/arch/powerpc/include/uapi/asm/kvm.h
>> +++ b/arch/powerpc/include/uapi/asm/kvm.h
>> @@ -26,6 +26,7 @@
>> #define __KVM_HAVE_SPAPR_TCE
>> #define __KVM_HAVE_PPC_SMT
>> #define __KVM_HAVE_IRQCHIP
>> +#define __KVM_HAVE_IRQ_LINE
>> struct kvm_regs {
>>  __u64 pc;
>> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
>> index c431fea..874c106 100644
>> --- a/arch/powerpc/kvm/powerpc.c
>> +++ b/arch/powerpc/kvm/powerpc.c
>> @@ -33,6 +33,7 @@
>> #include 
>> #include 
>> #include "timing.h"
>> +#include "irq.h"
>> #include "../mm/mmu_decl.h"
>> #define CREATE_TRACE_POINTS
>> @@ -945,6 +946,18 @@ static int kvm_vm_ioctl_get_pvinfo(struct 
>> kvm_ppc_pvinfo *pvinfo)
>>  return 0;
>> }
>> +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
>> +  bool line_status)
>> +{
>> +if (!irqchip_in_kernel(kvm))
>> +return -ENXIO;
>> +
>> +irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
>> +irq_event->irq, irq_event->level,
>> +line_status);
>> +return 0;
>> +}
> 
> As Paul noted in the XICS patchset, this could reference an MPIC that has 
> gone away if the user never attached any vcpus and then closed the MPIC fd.  
> It's not a reasonable use case, but it could be used malicously to get the 
> kernel to access a bad pointer.  The irqchip_in_kernel check helps somewhat, 
> but it's meant for ensuring that the creation has happened -- it's racy if 
> used for ensuring that destruction hasn't happened.
> 
> The problem is rooted in the awkwardness of performing an operation that 
> logically should be on the MPIC fd, but is instead being done on the vm fd.
> 
> I think these three steps would fix it (the first two seem like things we 
> should be doing anyway):
> - During MPIC destruction, make sure MPIC deregisters all routes that 
> reference it.
> - In kvm_set_irq(), do not release the RCU read lock until after the set() 
> function has been called.
> - Do not hook up kvm_send_userspace_msi() to MPIC or other new irqchips, as 
> that bypasses the RCU lock.  It could be supported as a device fd ioctl if 
> desired, or it could be reworked to operate on an RCU-managed list of MSI 
> handlers, though MPIC really doesn't need this at all.

Can't we just add an RCU lock in the send_userspace_msi case? I don't think we 
should handle MSIs any differently from normal IRQs.


Alex
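
For illustration, here is a rough sketch of the locking change being discussed,
based on the kvm_set_irq() flow in virt/kvm/irq_comm.c. This is not the actual
patch, and it assumes every registered set() callback is safe to call under
rcu_read_lock() (i.e. does not sleep):

int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
                bool line_status)
{
        struct kvm_kernel_irq_routing_entry *e;
        struct kvm_irq_routing_table *irq_rt;
        int ret = -1;

        trace_kvm_set_irq(irq, level, irq_source_id);

        /* Keep the RCU read-side critical section open across the set()
         * callbacks, so the irqchip backing a route cannot go away while
         * its routing entries are still in use. */
        rcu_read_lock();
        irq_rt = rcu_dereference(kvm->irq_routing);
        if (irq < irq_rt->nr_rt_entries)
                hlist_for_each_entry(e, &irq_rt->map[irq], link) {
                        int r = e->set(e, kvm, irq_source_id, level,
                                       line_status);
                        if (r < 0)
                                continue;
                        ret = r + ((ret < 0) ? 0 : ret);
                }
        rcu_read_unlock();

        return ret;
}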



Re: [PATCH v4 1/6] kvm: add device control API

2013-04-25 Thread Alexander Graf

On 25.04.2013, at 11:43, Gleb Natapov wrote:

> On Fri, Apr 12, 2013 at 07:08:42PM -0500, Scott Wood wrote:
>> Currently, devices that are emulated inside KVM are configured in a
>> hardcoded manner based on an assumption that any given architecture
>> only has one way to do it.  If there's any need to access device state,
>> it is done through inflexible one-purpose-only IOCTLs (e.g.
>> KVM_GET/SET_LAPIC).  Defining new IOCTLs for every little thing is
>> cumbersome and depletes a limited numberspace.
>> 
>> This API provides a mechanism to instantiate a device of a certain
>> type, returning an ID that can be used to set/get attributes of the
>> device.  Attributes may include configuration parameters (e.g.
>> register base address), device state, operational commands, etc.  It
>> is similar to the ONE_REG API, except that it acts on devices rather
>> than vcpus.
>> 
>> Both device types and individual attributes can be tested without having
>> to create the device or get/set the attribute, without the need for
>> separately managing enumerated capabilities.
>> 
>> Signed-off-by: Scott Wood 
>> ---
>> v4:
>> - Move some boilerplate back into generic code, as requested by Gleb.
>>   File descriptor management and reference counting is no longer the
>>   concern of the device implementation.
>> 
>> - Don't hold kvm->lock during create.  The original reasons
>>   for doing so have vanished as far as MPIC is concerned, and
>>   this avoids needing to answer the question of whether to
>>   hold the lock during destroy as well.
>> 
>>   Paul, you may need to acquire the lock yourself in kvm_create_xics()
>>   to protect the -EEXIST check.
>> 
>> v3: remove some changes that were merged into this patch by accident,
>> and fix the error documentation for KVM_CREATE_DEVICE.
>> ---
>> Documentation/virtual/kvm/api.txt|   70 
>> Documentation/virtual/kvm/devices/README |1 +
>> include/linux/kvm_host.h |   35 
>> include/uapi/linux/kvm.h |   27 +++
>> virt/kvm/kvm_main.c  |  129 
>> ++
>> 5 files changed, 262 insertions(+)
>> create mode 100644 Documentation/virtual/kvm/devices/README
>> 
>> diff --git a/Documentation/virtual/kvm/api.txt 
>> b/Documentation/virtual/kvm/api.txt
>> index 976eb65..d52f3f9 100644
>> --- a/Documentation/virtual/kvm/api.txt
>> +++ b/Documentation/virtual/kvm/api.txt
>> @@ -2173,6 +2173,76 @@ header; first `n_valid' valid entries with contents 
>> from the data
>> written, then `n_invalid' invalid entries, invalidating any previously
>> valid entries found.
>> 
>> +4.79 KVM_CREATE_DEVICE
>> +
>> +Capability: KVM_CAP_DEVICE_CTRL
>> +Type: vm ioctl
>> +Parameters: struct kvm_create_device (in/out)
>> +Returns: 0 on success, -1 on error
>> +Errors:
>> +  ENODEV: The device type is unknown or unsupported
>> +  EEXIST: Device already created, and this type of device may not
>> +  be instantiated multiple times
>> +
>> +  Other error conditions may be defined by individual device types or
>> +  have their standard meanings.
>> +
>> +Creates an emulated device in the kernel.  The file descriptor returned
>> +in fd can be used with KVM_SET/GET/HAS_DEVICE_ATTR.
>> +
>> +If the KVM_CREATE_DEVICE_TEST flag is set, only test whether the
>> +device type is supported (not necessarily whether it can be created
>> +in the current vm).
>> +
>> +Individual devices should not define flags.  Attributes should be used
>> +for specifying any behavior that is not implied by the device type
>> +number.
>> +
>> +struct kvm_create_device {
>> +__u32   type;   /* in: KVM_DEV_TYPE_xxx */
>> +__u32   fd; /* out: device handle */
>> +__u32   flags;  /* in: KVM_CREATE_DEVICE_xxx */
>> +};
> Should we add __u32 padding here to make struct size multiple of u64?

Do you know of any arch that pads structs to u64 boundaries? x86_64 doesn't and 
ppc64 doesn't either.

> 
>> +
>> +4.80 KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR
>> +
>> +Capability: KVM_CAP_DEVICE_CTRL
>> +Type: device ioctl
>> +Parameters: struct kvm_device_attr
>> +Returns: 0 on success, -1 on error
>> +Errors:
>> +  ENXIO:  The group or attribute is unknown/unsupported for this device
>> +  EPERM:  The attribute cannot (currently) be accessed this way
>> +  (e.g. read-only attribute, or attribute that only makes
>> +  sense when the device is in a different state)
>> +
>> +  Other error conditions may be defined by individual device types.
>> +
>> +Gets/sets a specified piece of device configuration and/or state.  The
>> +semantics are device-specific.  See individual device documentation in
>> +the "devices" directory.  As with ONE_REG, the size of the data
>> +transferred is defined by the particular attribute.
>> +
>> +struct kvm_device_attr {
>> +__u32   flags;  /* no flags currently defined */
>> +__u32   group;  /* device-defined */
>> +__u64   attr;   /* group-

Re: [PATCH 00/17] KVM: PPC: In-kernel MPIC support with irqfd v3

2013-04-25 Thread Michael S. Tsirkin
On Fri, Apr 19, 2013 at 04:06:11PM +0200, Alexander Graf wrote:
> Hi,
> 
> This patch set contains a fully working implementation of the in-kernel MPIC
> from Scott with a few fixups and a new version of my irqfd generalization
> patch set.

For patches 1-8:
Acked-by: Michael S. Tsirkin 

I don't have an opinion about the rest.

> v1 -> v2:
> 
>   - depend on CONFIG_ defines rather than __KVM defines
>   - fix compile issues
>   - fix the kvm_irqchip{,s} typo
> 
> v2 -> v3:
> 
>   - make mpic pointer type safe
>   - add wmb before setting global mpic variable
>   - make eoi notification happen unlockedly
>   - add IRQ routing documentation
>   - announce mpic availability after its creation
>   - fix pr_debug again
> 
> I have refrained from touching IA64 at all in this patch set. It's marked
> as BROKEN, I doubt it even compiles at all today. The only sensible thing
> to do would be to remove all of IA64 kvm code from the kernel tree, but
> that is out of scope for this patch set and definitely should not gate it.
> 
> 
> Alex
> 
> Alexander Graf (11):
>   KVM: Add KVM_IRQCHIP_NUM_PINS in addition to KVM_IOAPIC_NUM_PINS
>   KVM: Introduce CONFIG_HAVE_KVM_IRQ_ROUTING
>   KVM: Drop __KVM_HAVE_IOAPIC condition on irq routing
>   KVM: Remove kvm_get_intr_delivery_bitmask
>   KVM: Move irq routing to generic code
>   KVM: Extract generic irqchip logic into irqchip.c
>   KVM: Move irq routing setup to irqchip.c
>   KVM: Move irqfd resample cap handling to generic code
>   KVM: PPC: Support irq routing and irqfd for in-kernel MPIC
>   KVM: PPC: MPIC: Add support for KVM_IRQ_LINE
>   KVM: PPC: MPIC: Restrict to e500 platforms
> 
> Scott Wood (6):
>   kvm: add device control API
>   kvm/ppc/mpic: import hw/openpic.c from QEMU
>   kvm/ppc/mpic: remove some obviously unneeded code
>   kvm/ppc/mpic: adapt to kernel style and environment
>   kvm/ppc/mpic: in-kernel MPIC emulation
>   kvm/ppc/mpic: add KVM_CAP_IRQ_MPIC
> 
>  Documentation/virtual/kvm/api.txt  |   78 ++
>  Documentation/virtual/kvm/devices/README   |1 +
>  Documentation/virtual/kvm/devices/mpic.txt |   48 +
>  arch/powerpc/include/asm/kvm_host.h|   24 +-
>  arch/powerpc/include/asm/kvm_ppc.h |   30 +
>  arch/powerpc/include/uapi/asm/kvm.h|9 +
>  arch/powerpc/kvm/Kconfig   |   12 +
>  arch/powerpc/kvm/Makefile  |3 +
>  arch/powerpc/kvm/booke.c   |   12 +-
>  arch/powerpc/kvm/irq.h |   17 +
>  arch/powerpc/kvm/mpic.c| 1876 
> 
>  arch/powerpc/kvm/powerpc.c |   55 +-
>  arch/x86/include/asm/kvm_host.h|2 +
>  arch/x86/kvm/Kconfig   |1 +
>  arch/x86/kvm/Makefile  |2 +-
>  arch/x86/kvm/x86.c |1 -
>  include/linux/kvm_host.h   |   53 +-
>  include/trace/events/kvm.h |   12 +-
>  include/uapi/linux/kvm.h   |   33 +-
>  virt/kvm/Kconfig   |3 +
>  virt/kvm/assigned-dev.c|   30 -
>  virt/kvm/eventfd.c |6 +-
>  virt/kvm/irq_comm.c|  194 +---
>  virt/kvm/irqchip.c |  237 
>  virt/kvm/kvm_main.c|  170 +++-
>  25 files changed, 2659 insertions(+), 250 deletions(-)
>  create mode 100644 Documentation/virtual/kvm/devices/README
>  create mode 100644 Documentation/virtual/kvm/devices/mpic.txt
>  create mode 100644 arch/powerpc/kvm/irq.h
>  create mode 100644 arch/powerpc/kvm/mpic.c
>  create mode 100644 virt/kvm/irqchip.c
> 


Re: [PATCH 08/17] KVM: Move irqfd resample cap handling to generic code

2013-04-25 Thread Michael S. Tsirkin
On Fri, Apr 19, 2013 at 04:06:19PM +0200, Alexander Graf wrote:
> Now that we have most irqfd code completely platform agnostic, let's move
> irqfd's resample capability return to generic code as well.
> 
> Signed-off-by: Alexander Graf 

Acked-by: Michael S. Tsirkin 

> ---
>  arch/x86/kvm/x86.c  |1 -
>  virt/kvm/kvm_main.c |3 +++
>  2 files changed, 3 insertions(+), 1 deletions(-)
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 50e2e10..888d892 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2513,7 +2513,6 @@ int kvm_dev_ioctl_check_extension(long ext)
>   case KVM_CAP_PCI_2_3:
>   case KVM_CAP_KVMCLOCK_CTRL:
>   case KVM_CAP_READONLY_MEM:
> - case KVM_CAP_IRQFD_RESAMPLE:
>   r = 1;
>   break;
>   case KVM_CAP_COALESCED_MMIO:
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index b6f3354..f9492f3 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2433,6 +2433,9 @@ static long kvm_dev_ioctl_check_extension_generic(long 
> arg)
>  #ifdef CONFIG_HAVE_KVM_MSI
>   case KVM_CAP_SIGNAL_MSI:
>  #endif
> +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
> + case KVM_CAP_IRQFD_RESAMPLE:
> +#endif
>   return 1;
>  #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
>   case KVM_CAP_IRQ_ROUTING:
> -- 
> 1.6.0.2
> 


Re: [PATCH 07/17] KVM: Move irq routing setup to irqchip.c

2013-04-25 Thread Michael S. Tsirkin
On Fri, Apr 19, 2013 at 04:06:18PM +0200, Alexander Graf wrote:
> Setting up IRQ routes is nothing IOAPIC specific. Extract everything
> that really is generic code into irqchip.c and only leave the ioapic
> specific bits to irq_comm.c.
> 
> Signed-off-by: Alexander Graf 

Acked-by: Michael S. Tsirkin 

> ---
>  include/linux/kvm_host.h |3 ++
>  virt/kvm/irq_comm.c  |   76 ++---
>  virt/kvm/irqchip.c   |   85 
> ++
>  3 files changed, 91 insertions(+), 73 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index a7bfe9d..dcef724 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -961,6 +961,9 @@ int kvm_set_irq_routing(struct kvm *kvm,
>   const struct kvm_irq_routing_entry *entries,
>   unsigned nr,
>   unsigned flags);
> +int kvm_set_routing_entry(struct kvm_irq_routing_table *rt,
> +   struct kvm_kernel_irq_routing_entry *e,
> +   const struct kvm_irq_routing_entry *ue);
>  void kvm_free_irq_routing(struct kvm *kvm);
>  
>  int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi);
> diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> index d5008f4..e2e6b44 100644
> --- a/virt/kvm/irq_comm.c
> +++ b/virt/kvm/irq_comm.c
> @@ -271,27 +271,14 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned 
> irqchip, unsigned pin,
>   rcu_read_unlock();
>  }
>  
> -static int setup_routing_entry(struct kvm_irq_routing_table *rt,
> -struct kvm_kernel_irq_routing_entry *e,
> -const struct kvm_irq_routing_entry *ue)
> +int kvm_set_routing_entry(struct kvm_irq_routing_table *rt,
> +   struct kvm_kernel_irq_routing_entry *e,
> +   const struct kvm_irq_routing_entry *ue)
>  {
>   int r = -EINVAL;
>   int delta;
>   unsigned max_pin;
> - struct kvm_kernel_irq_routing_entry *ei;
>  
> - /*
> -  * Do not allow GSI to be mapped to the same irqchip more than once.
> -  * Allow only one to one mapping between GSI and MSI.
> -  */
> - hlist_for_each_entry(ei, &rt->map[ue->gsi], link)
> - if (ei->type == KVM_IRQ_ROUTING_MSI ||
> - ue->type == KVM_IRQ_ROUTING_MSI ||
> - ue->u.irqchip.irqchip == ei->irqchip.irqchip)
> - return r;
> -
> - e->gsi = ue->gsi;
> - e->type = ue->type;
>   switch (ue->type) {
>   case KVM_IRQ_ROUTING_IRQCHIP:
>   delta = 0;
> @@ -328,68 +315,11 @@ static int setup_routing_entry(struct 
> kvm_irq_routing_table *rt,
>   goto out;
>   }
>  
> - hlist_add_head(&e->link, &rt->map[e->gsi]);
>   r = 0;
>  out:
>   return r;
>  }
>  
> -int kvm_set_irq_routing(struct kvm *kvm,
> - const struct kvm_irq_routing_entry *ue,
> - unsigned nr,
> - unsigned flags)
> -{
> - struct kvm_irq_routing_table *new, *old;
> - u32 i, j, nr_rt_entries = 0;
> - int r;
> -
> - for (i = 0; i < nr; ++i) {
> - if (ue[i].gsi >= KVM_MAX_IRQ_ROUTES)
> - return -EINVAL;
> - nr_rt_entries = max(nr_rt_entries, ue[i].gsi);
> - }
> -
> - nr_rt_entries += 1;
> -
> - new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head))
> -   + (nr * sizeof(struct kvm_kernel_irq_routing_entry)),
> -   GFP_KERNEL);
> -
> - if (!new)
> - return -ENOMEM;
> -
> - new->rt_entries = (void *)&new->map[nr_rt_entries];
> -
> - new->nr_rt_entries = nr_rt_entries;
> - for (i = 0; i < 3; i++)
> - for (j = 0; j < KVM_IRQCHIP_NUM_PINS; j++)
> - new->chip[i][j] = -1;
> -
> - for (i = 0; i < nr; ++i) {
> - r = -EINVAL;
> - if (ue->flags)
> - goto out;
> - r = setup_routing_entry(new, &new->rt_entries[i], ue);
> - if (r)
> - goto out;
> - ++ue;
> - }
> -
> - mutex_lock(&kvm->irq_lock);
> - old = kvm->irq_routing;
> - kvm_irq_routing_update(kvm, new);
> - mutex_unlock(&kvm->irq_lock);
> -
> - synchronize_rcu();
> -
> - new = old;
> - r = 0;
> -
> -out:
> - kfree(new);
> - return r;
> -}
> -
>  #define IOAPIC_ROUTING_ENTRY(irq) \
>   { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP,  \
> .u.irqchip.irqchip = KVM_IRQCHIP_IOAPIC, .u.irqchip.pin = (irq) }
> diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
> index 12f7f26..20dc9e4 100644
> --- a/virt/kvm/irqchip.c
> +++ b/virt/kvm/irqchip.c
> @@ -150,3 +150,88 @@ void kvm_free_irq_routing(struct kvm *kvm)
>  at this stage */
>   kfree(kvm->irq_routing);
>  }
> +
> +static int setup_routing_entry(struct kvm_irq_routing

Re: [PATCH 06/17] KVM: Extract generic irqchip logic into irqchip.c

2013-04-25 Thread Michael S. Tsirkin
On Fri, Apr 19, 2013 at 04:06:17PM +0200, Alexander Graf wrote:
> The current irq_comm.c file contains pieces of code that are generic
> across different irqchip implementations, as well as code that is
> fully IOAPIC specific.
> 
> Split the generic bits out into irqchip.c.
> 
> Signed-off-by: Alexander Graf 

Acked-by: Michael S. Tsirkin 

> ---
>  arch/x86/kvm/Makefile  |2 +-
>  include/trace/events/kvm.h |   12 +++-
>  virt/kvm/irq_comm.c|  118 --
>  virt/kvm/irqchip.c |  152 
> 
>  4 files changed, 163 insertions(+), 121 deletions(-)
>  create mode 100644 virt/kvm/irqchip.c
> 
> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> index 04d3040..a797b8e 100644
> --- a/arch/x86/kvm/Makefile
> +++ b/arch/x86/kvm/Makefile
> @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
>  
>  kvm-y+= $(addprefix ../../../virt/kvm/, kvm_main.o 
> ioapic.o \
>   coalesced_mmio.o irq_comm.o eventfd.o \
> - assigned-dev.o)
> + assigned-dev.o irqchip.o)
>  kvm-$(CONFIG_IOMMU_API)  += $(addprefix ../../../virt/kvm/, iommu.o)
>  kvm-$(CONFIG_KVM_ASYNC_PF)   += $(addprefix ../../../virt/kvm/, async_pf.o)
>  
> diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
> index 19911dd..7005d11 100644
> --- a/include/trace/events/kvm.h
> +++ b/include/trace/events/kvm.h
> @@ -37,7 +37,7 @@ TRACE_EVENT(kvm_userspace_exit,
> __entry->errno < 0 ? -__entry->errno : __entry->reason)
>  );
>  
> -#if defined(__KVM_HAVE_IRQ_LINE)
> +#if defined(CONFIG_HAVE_KVM_IRQCHIP)
>  TRACE_EVENT(kvm_set_irq,
>   TP_PROTO(unsigned int gsi, int level, int irq_source_id),
>   TP_ARGS(gsi, level, irq_source_id),
> @@ -122,6 +122,10 @@ TRACE_EVENT(kvm_msi_set_irq,
>   {KVM_IRQCHIP_PIC_SLAVE, "PIC slave"},   \
>   {KVM_IRQCHIP_IOAPIC,"IOAPIC"}
>  
> +#endif /* defined(__KVM_HAVE_IOAPIC) */
> +
> +#if defined(CONFIG_HAVE_KVM_IRQCHIP)
> +
>  TRACE_EVENT(kvm_ack_irq,
>   TP_PROTO(unsigned int irqchip, unsigned int pin),
>   TP_ARGS(irqchip, pin),
> @@ -136,14 +140,18 @@ TRACE_EVENT(kvm_ack_irq,
>   __entry->pin= pin;
>   ),
>  
> +#ifdef kvm_irqchips
>   TP_printk("irqchip %s pin %u",
> __print_symbolic(__entry->irqchip, kvm_irqchips),
>__entry->pin)
> +#else
> + TP_printk("irqchip %d pin %u", __entry->irqchip, __entry->pin)
> +#endif
>  );
>  
> +#endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */
>  
>  
> -#endif /* defined(__KVM_HAVE_IOAPIC) */
>  
>  #define KVM_TRACE_MMIO_READ_UNSATISFIED 0
>  #define KVM_TRACE_MMIO_READ 1
> diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> index 7c0071d..d5008f4 100644
> --- a/virt/kvm/irq_comm.c
> +++ b/virt/kvm/irq_comm.c
> @@ -151,59 +151,6 @@ static int kvm_set_msi_inatomic(struct 
> kvm_kernel_irq_routing_entry *e,
>   return -EWOULDBLOCK;
>  }
>  
> -int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
> -{
> - struct kvm_kernel_irq_routing_entry route;
> -
> - if (!irqchip_in_kernel(kvm) || msi->flags != 0)
> - return -EINVAL;
> -
> - route.msi.address_lo = msi->address_lo;
> - route.msi.address_hi = msi->address_hi;
> - route.msi.data = msi->data;
> -
> - return kvm_set_msi(&route, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, false);
> -}
> -
> -/*
> - * Return value:
> - *  < 0   Interrupt was ignored (masked or not delivered for other reasons)
> - *  = 0   Interrupt was coalesced (previous irq is still pending)
> - *  > 0   Number of CPUs interrupt was delivered to
> - */
> -int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
> - bool line_status)
> -{
> - struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS];
> - int ret = -1, i = 0;
> - struct kvm_irq_routing_table *irq_rt;
> -
> - trace_kvm_set_irq(irq, level, irq_source_id);
> -
> - /* Not possible to detect if the guest uses the PIC or the
> -  * IOAPIC.  So set the bit in both. The guest will ignore
> -  * writes to the unused one.
> -  */
> - rcu_read_lock();
> - irq_rt = rcu_dereference(kvm->irq_routing);
> - if (irq < irq_rt->nr_rt_entries)
> - hlist_for_each_entry(e, &irq_rt->map[irq], link)
> - irq_set[i++] = *e;
> - rcu_read_unlock();
> -
> - while(i--) {
> - int r;
> - r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level,
> - line_status);
> - if (r < 0)
> - continue;
> -
> - ret = r + ((ret < 0) ? 0 : ret);
> - }
> -
> - return ret;
> -}
> -
>  /*
>   * Deliver an IRQ in an atomic context if we can, or return a failure,
>   * user can retry in a process context.
> @@ -241,63 +188,6 @@ int kvm_set_irq_inatomic(st

Re: [PATCH 04/17] KVM: Remove kvm_get_intr_delivery_bitmask

2013-04-25 Thread Michael S. Tsirkin
On Fri, Apr 19, 2013 at 04:06:15PM +0200, Alexander Graf wrote:
> The prototype has been stale for a while; I can't spot any real function
> definition behind it. Let's just remove it.
> 
> Signed-off-by: Alexander Graf 

Acked-by: Michael S. Tsirkin 

> ---
>  include/linux/kvm_host.h |5 -
>  1 files changed, 0 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 4215d4f..a7bfe9d 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -719,11 +719,6 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, 
> int irq,
>  void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
>bool mask);
>  
> -#ifdef __KVM_HAVE_IOAPIC
> -void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
> -union kvm_ioapic_redirect_entry *entry,
> -unsigned long *deliver_bitmask);
> -#endif
>  int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
>   bool line_status);
>  int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int 
> level);
> -- 
> 1.6.0.2
> 


Re: [PATCH 05/17] KVM: Move irq routing to generic code

2013-04-25 Thread Michael S. Tsirkin
On Fri, Apr 19, 2013 at 04:06:16PM +0200, Alexander Graf wrote:
> The IRQ routing set ioctl lives in the hacky device assignment code inside
> of KVM today. This is definitely the wrong place for it. Move it to the much
> more natural kvm_main.c.
> 
> Signed-off-by: Alexander Graf 

Acked-by: Michael S. Tsirkin 

> ---
>  virt/kvm/assigned-dev.c |   30 --
>  virt/kvm/kvm_main.c |   30 ++
>  2 files changed, 30 insertions(+), 30 deletions(-)
> 
> diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> index f4c7f59..8db4370 100644
> --- a/virt/kvm/assigned-dev.c
> +++ b/virt/kvm/assigned-dev.c
> @@ -983,36 +983,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, 
> unsigned ioctl,
>   goto out;
>   break;
>   }
> -#ifdef KVM_CAP_IRQ_ROUTING
> - case KVM_SET_GSI_ROUTING: {
> - struct kvm_irq_routing routing;
> - struct kvm_irq_routing __user *urouting;
> - struct kvm_irq_routing_entry *entries;
> -
> - r = -EFAULT;
> - if (copy_from_user(&routing, argp, sizeof(routing)))
> - goto out;
> - r = -EINVAL;
> - if (routing.nr >= KVM_MAX_IRQ_ROUTES)
> - goto out;
> - if (routing.flags)
> - goto out;
> - r = -ENOMEM;
> - entries = vmalloc(routing.nr * sizeof(*entries));
> - if (!entries)
> - goto out;
> - r = -EFAULT;
> - urouting = argp;
> - if (copy_from_user(entries, urouting->entries,
> -routing.nr * sizeof(*entries)))
> - goto out_free_irq_routing;
> - r = kvm_set_irq_routing(kvm, entries, routing.nr,
> - routing.flags);
> - out_free_irq_routing:
> - vfree(entries);
> - break;
> - }
> -#endif /* KVM_CAP_IRQ_ROUTING */
>  #ifdef __KVM_HAVE_MSIX
>   case KVM_ASSIGN_SET_MSIX_NR: {
>   struct kvm_assigned_msix_nr entry_nr;
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 2c3b226..b6f3354 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2274,6 +2274,36 @@ static long kvm_vm_ioctl(struct file *filp,
>   break;
>   }
>  #endif
> +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
> + case KVM_SET_GSI_ROUTING: {
> + struct kvm_irq_routing routing;
> + struct kvm_irq_routing __user *urouting;
> + struct kvm_irq_routing_entry *entries;
> +
> + r = -EFAULT;
> + if (copy_from_user(&routing, argp, sizeof(routing)))
> + goto out;
> + r = -EINVAL;
> + if (routing.nr >= KVM_MAX_IRQ_ROUTES)
> + goto out;
> + if (routing.flags)
> + goto out;
> + r = -ENOMEM;
> + entries = vmalloc(routing.nr * sizeof(*entries));
> + if (!entries)
> + goto out;
> + r = -EFAULT;
> + urouting = argp;
> + if (copy_from_user(entries, urouting->entries,
> +routing.nr * sizeof(*entries)))
> + goto out_free_irq_routing;
> + r = kvm_set_irq_routing(kvm, entries, routing.nr,
> + routing.flags);
> + out_free_irq_routing:
> + vfree(entries);
> + break;
> + }
> +#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
>   default:
>   r = kvm_arch_vm_ioctl(filp, ioctl, arg);
>   if (r == -ENOTTY)
> -- 
> 1.6.0.2
> 


Re: [PATCH 03/17] KVM: Drop __KVM_HAVE_IOAPIC condition on irq routing

2013-04-25 Thread Michael S. Tsirkin
On Fri, Apr 19, 2013 at 04:06:14PM +0200, Alexander Graf wrote:
> We have a capability enquiry system that allows user space to ask kvm
> whether a feature is available.
> 
> The point behind this system is that we can have different kernel
> configurations with different capabilities and user space can adjust
> accordingly.
> 
> Because features can always be non existent, we can drop any #ifdefs
> on CAP defines that could be used generically, like the irq routing
> bits. These can be easily reused for non-IOAPIC systems as well.
> 
> Signed-off-by: Alexander Graf 

Acked-by: Michael S. Tsirkin 

> ---
>  include/uapi/linux/kvm.h |2 --
>  1 files changed, 0 insertions(+), 2 deletions(-)
> 
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 74d0ff3..c741902 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -579,9 +579,7 @@ struct kvm_ppc_smmu_info {
>  #ifdef __KVM_HAVE_PIT
>  #define KVM_CAP_REINJECT_CONTROL 24
>  #endif
> -#ifdef __KVM_HAVE_IOAPIC
>  #define KVM_CAP_IRQ_ROUTING 25
> -#endif
>  #define KVM_CAP_IRQ_INJECT_STATUS 26
>  #ifdef __KVM_HAVE_DEVICE_ASSIGNMENT
>  #define KVM_CAP_DEVICE_DEASSIGNMENT 27
> -- 
> 1.6.0.2
> 
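
As background, the capability enquiry the commit message above refers to is the
KVM_CHECK_EXTENSION ioctl on /dev/kvm. A minimal userspace probe looks roughly
like this (illustrative sketch only):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
        int kvm = open("/dev/kvm", O_RDWR);

        if (kvm < 0) {
                perror("open /dev/kvm");
                return 1;
        }

        /* 0 means "not supported"; for KVM_CAP_IRQ_ROUTING the kernel
         * answers with KVM_MAX_IRQ_ROUTES when routing is available. */
        int routes = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_IRQ_ROUTING);

        printf("KVM_CAP_IRQ_ROUTING: %d\n", routes);
        return 0;
}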


Re: [PATCH 02/17] KVM: Introduce CONFIG_HAVE_KVM_IRQ_ROUTING

2013-04-25 Thread Michael S. Tsirkin
On Fri, Apr 19, 2013 at 04:06:13PM +0200, Alexander Graf wrote:
> Quite a bit of code in KVM has been conditionalized on availability of
> IOAPIC emulation. However, most of it is generically applicable to
> platforms that don't have an IOAPIC, but a different type of irq chip.
> 
> Make code that only relies on IRQ routing, not an APIC itself, on
> CONFIG_HAVE_KVM_IRQ_ROUTING, so that we can reuse it later.
> 
> Signed-off-by: Alexander Graf 

Acked-by: Michael S. Tsirkin 

> ---
>  arch/x86/kvm/Kconfig |1 +
>  include/linux/kvm_host.h |6 +++---
>  virt/kvm/Kconfig |3 +++
>  virt/kvm/eventfd.c   |6 +++---
>  virt/kvm/kvm_main.c  |2 +-
>  5 files changed, 11 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index 586f000..9d50efd 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -29,6 +29,7 @@ config KVM
>   select MMU_NOTIFIER
>   select ANON_INODES
>   select HAVE_KVM_IRQCHIP
> + select HAVE_KVM_IRQ_ROUTING
>   select HAVE_KVM_EVENTFD
>   select KVM_APIC_ARCHITECTURE
>   select KVM_ASYNC_PF
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index bf3b1dc..4215d4f 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -304,7 +304,7 @@ struct kvm_kernel_irq_routing_entry {
>   struct hlist_node link;
>  };
>  
> -#ifdef __KVM_HAVE_IOAPIC
> +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
>  
>  struct kvm_irq_routing_table {
>   int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
> @@ -432,7 +432,7 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
>  int __must_check vcpu_load(struct kvm_vcpu *vcpu);
>  void vcpu_put(struct kvm_vcpu *vcpu);
>  
> -#ifdef __KVM_HAVE_IOAPIC
> +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
>  int kvm_irqfd_init(void);
>  void kvm_irqfd_exit(void);
>  #else
> @@ -957,7 +957,7 @@ static inline int mmu_notifier_retry(struct kvm *kvm, 
> unsigned long mmu_seq)
>  }
>  #endif
>  
> -#ifdef KVM_CAP_IRQ_ROUTING
> +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
>  
>  #define KVM_MAX_IRQ_ROUTES 1024
>  
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index d01b24b..779262f 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -6,6 +6,9 @@ config HAVE_KVM
>  config HAVE_KVM_IRQCHIP
> bool
>  
> +config HAVE_KVM_IRQ_ROUTING
> +   bool
> +
>  config HAVE_KVM_EVENTFD
> bool
> select EVENTFD
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index c5d43ff..64ee720 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -35,7 +35,7 @@
>  
>  #include "iodev.h"
>  
> -#ifdef __KVM_HAVE_IOAPIC
> +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
>  /*
>   * 
>   * irqfd: Allows an fd to be used to inject an interrupt to the guest
> @@ -433,7 +433,7 @@ fail:
>  void
>  kvm_eventfd_init(struct kvm *kvm)
>  {
> -#ifdef __KVM_HAVE_IOAPIC
> +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
>   spin_lock_init(&kvm->irqfds.lock);
>   INIT_LIST_HEAD(&kvm->irqfds.items);
>   INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
> @@ -442,7 +442,7 @@ kvm_eventfd_init(struct kvm *kvm)
>   INIT_LIST_HEAD(&kvm->ioeventfds);
>  }
>  
> -#ifdef __KVM_HAVE_IOAPIC
> +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
>  /*
>   * shutdown any irqfd's that match fd+gsi
>   */
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index aaac1a7..2c3b226 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2404,7 +2404,7 @@ static long kvm_dev_ioctl_check_extension_generic(long 
> arg)
>   case KVM_CAP_SIGNAL_MSI:
>  #endif
>   return 1;
> -#ifdef KVM_CAP_IRQ_ROUTING
> +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
>   case KVM_CAP_IRQ_ROUTING:
>   return KVM_MAX_IRQ_ROUTES;
>  #endif
> -- 
> 1.6.0.2
> 


Re: [PATCH 01/17] KVM: Add KVM_IRQCHIP_NUM_PINS in addition to KVM_IOAPIC_NUM_PINS

2013-04-25 Thread Michael S. Tsirkin
On Fri, Apr 19, 2013 at 04:06:12PM +0200, Alexander Graf wrote:
> The concept of routing interrupt lines to an irqchip is nothing
> that is IOAPIC specific. Every irqchip has a maximum number of pins
> that can be linked to irq lines.
> 
> So let's add a new define that allows us to reuse generic code for
> non-IOAPIC platforms.
> 
> Signed-off-by: Alexander Graf 

Acked-by: Michael S. Tsirkin 

> ---
>  arch/x86/include/asm/kvm_host.h |2 ++
>  include/linux/kvm_host.h|2 +-
>  virt/kvm/irq_comm.c |2 +-
>  3 files changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 599f98b..f44c3fe 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -43,6 +43,8 @@
>  #define KVM_PIO_PAGE_OFFSET 1
>  #define KVM_COALESCED_MMIO_PAGE_OFFSET 2
>  
> +#define KVM_IRQCHIP_NUM_PINS  KVM_IOAPIC_NUM_PINS
> +
>  #define CR0_RESERVED_BITS   \
>   (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
> | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 93a5005..bf3b1dc 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -307,7 +307,7 @@ struct kvm_kernel_irq_routing_entry {
>  #ifdef __KVM_HAVE_IOAPIC
>  
>  struct kvm_irq_routing_table {
> - int chip[KVM_NR_IRQCHIPS][KVM_IOAPIC_NUM_PINS];
> + int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
>   struct kvm_kernel_irq_routing_entry *rt_entries;
>   u32 nr_rt_entries;
>   /*
> diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> index 25ab480..7c0071d 100644
> --- a/virt/kvm/irq_comm.c
> +++ b/virt/kvm/irq_comm.c
> @@ -480,7 +480,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
>  
>   new->nr_rt_entries = nr_rt_entries;
>   for (i = 0; i < 3; i++)
> - for (j = 0; j < KVM_IOAPIC_NUM_PINS; j++)
> + for (j = 0; j < KVM_IRQCHIP_NUM_PINS; j++)
>   new->chip[i][j] = -1;
>  
>   for (i = 0; i < nr; ++i) {
> -- 
> 1.6.0.2
> 


Re: [PATCH 15/17] KVM: PPC: Support irq routing and irqfd for in-kernel MPIC

2013-04-25 Thread Alexander Graf

On 23.04.2013, at 08:38, Paul Mackerras wrote:

> On Fri, Apr 19, 2013 at 04:06:26PM +0200, Alexander Graf wrote:
>> Now that all the irq routing and irqfd pieces are generic, we can expose
>> real irqchip support to all of KVM's internal helpers.
>> 
>> This allows us to use irqfd with the in-kernel MPIC.
> 
> [snip]
>> diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
>> index 10bc08a..d137df8 100644
>> --- a/arch/powerpc/kvm/mpic.c
>> +++ b/arch/powerpc/kvm/mpic.c
> [snip]
>> +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
>> +struct kvm *kvm, int irq_source_id, int level, bool line_status)
> [snip]
>> +int kvm_set_routing_entry(struct kvm_irq_routing_table *rt,
>> +  struct kvm_kernel_irq_routing_entry *e,
>> +  const struct kvm_irq_routing_entry *ue)
> 
> How do you see this working once we have more than one interrupt
> controller emulation in the kernel?  Presumably these two will have to
> move out to a common file, rather than being in mpic.c, but then the
> question is how do we know which interrupt controller to send the GSI
> to?  Were you thinking we would have a restriction that you can only
> instantiate one interrupt controller of any type?  Or were you
> thinking we would have an enum for kvm_irq_routing_irqchip::irqchip?
> In that case how would we handle MSIs?

In a first version with 2 interrupt controllers, I'd make them mutually
exclusive in Kconfig. That way each interrupt controller implements these
functions itself.

Later we can sit down and generalize this support. Then we would need a
mapping table recording which irqchip type each irqchip number is, and call
the respective functions (see the sketch below).

But the use case for that is so incredibly slim, and the user space API would
still be the same, that I don't think we need to worry about it today.


Alex
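
A very rough sketch of that "later" generalization, for illustration only: a
per-number table records which in-kernel irqchip model backs each irqchip
number, and routing callbacks dispatch through it. All of the names below are
hypothetical (kernel context is assumed for bool and -EINVAL):

struct kvm;

enum kvm_irqchip_kind {
        KVM_IRQCHIP_KIND_NONE,
        KVM_IRQCHIP_KIND_MPIC,
        KVM_IRQCHIP_KIND_XICS,
};

struct kvm_irqchip_ops {
        int (*set_pin)(struct kvm *kvm, void *priv, int pin, int level,
                       bool line_status);
};

struct kvm_irqchip_slot {
        enum kvm_irqchip_kind kind;
        const struct kvm_irqchip_ops *ops;
        void *priv;
};

/* One slot per irqchip number referenced by kvm_irq_routing_irqchip. */
static int route_to_irqchip(struct kvm *kvm, struct kvm_irqchip_slot *slots,
                            unsigned int nr_slots, unsigned int irqchip,
                            int pin, int level, bool line_status)
{
        if (irqchip >= nr_slots || slots[irqchip].kind == KVM_IRQCHIP_KIND_NONE)
                return -EINVAL;
        return slots[irqchip].ops->set_pin(kvm, slots[irqchip].priv,
                                           pin, level, line_status);
}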



Re: [PATCH 15/17] KVM: PPC: Support irq routing and irqfd for in-kernel MPIC

2013-04-25 Thread Alexander Graf

On 19.04.2013, at 20:02, Scott Wood wrote:

> On 04/19/2013 09:06:26 AM, Alexander Graf wrote:
>> diff --git a/Documentation/virtual/kvm/devices/mpic.txt 
>> b/Documentation/virtual/kvm/devices/mpic.txt
>> index ce98e32..dadc1e0 100644
>> --- a/Documentation/virtual/kvm/devices/mpic.txt
>> +++ b/Documentation/virtual/kvm/devices/mpic.txt
>> @@ -35,3 +35,14 @@ Groups:
>> "attr" is the IRQ number.  IRQ numbers for standard sources are the
>> byte offset of the relevant IVPR from EIVPR0, divided by 32.
>> +
>> +IRQ Routing:
>> +
>> +  The MPIC emulation supports IRQ routing. Only a single MPIC device can
>> +  be instantiated. Once that device has been created, it's available as
>> +  irqchip id 0.
>> +
> 
>> +  This irqchip 0 has 256 interrupt pins. These pins reflect the SRC pins
>> +  on the MPIC controller.
> 
> This irqchip 0 has 256 interrupt pins, which expose the interrupts in the 
> main array of interrupt sources (a.k.a. "SRC" interrupts).  The numbering is 
> the same as the MPIC device tree binding -- based on the register offset from 
> the beginning of the sources array, without regard to any subdivisions in 
> chip documentation such as "internal" or "external" interrupts.  Default 
> routes are established for these pins, with the GSI being equal to the pin 
> number.
> 
>> +  Access to on-SRC registers is not implemented through IRQ routing 
>> mechanisms.
> 
> s/on-SRC registers/non-SRC interrupts/
> 
>> diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
>> index 10bc08a..d137df8 100644
>> --- a/arch/powerpc/kvm/mpic.c
>> +++ b/arch/powerpc/kvm/mpic.c
>> @@ -1029,6 +1029,7 @@ static int openpic_cpu_write_internal(void *opaque, 
>> gpa_t addr,
>>  struct irq_source *src;
>>  struct irq_dest *dst;
>>  int s_IRQ, n_IRQ;
>> +int notify_eoi = -1;
>>  pr_debug("%s: cpu %d addr %#llx <= 0x%08x\n", __func__, idx,
>>  addr, val);
>> @@ -1087,6 +1088,8 @@ static int openpic_cpu_write_internal(void *opaque, 
>> gpa_t addr,
>>  }
>>  IRQ_resetbit(&dst->servicing, s_IRQ);
>> +/* Notify listeners that the IRQ is over */
>> +notify_eoi = s_IRQ;
>>  /* Set up next servicing IRQ */
>>  s_IRQ = IRQ_get_next(opp, &dst->servicing);
>>  /* Check queued interrupts. */
>> @@ -1104,6 +1107,12 @@ static int openpic_cpu_write_internal(void *opaque, 
>> gpa_t addr,
>>  break;
>>  }
>> +if (notify_eoi != -1) {
>> +spin_unlock_irq(&opp->lock);
>> +kvm_notify_acked_irq(opp->kvm, 0, notify_eoi);
>> +spin_lock_irq(&opp->lock);
>> +}
> 
> I'd rather not have the "_irq" here, which could break if we enter this path 
> via an "_irqsave" (I realize there currently is no such path that reaches EOI 
> emulation).
> 
> Will we ever set notify_eoi when addr != EOI?  I'm wondering why it was moved 
> out of the switch statement, instead of being put at the end of the case EOI: 
> code.

I doubt it, but that's for the compiler to optimize away. I found it cleaner 
for some reason to put it down there. I don't think it really matters.


Alex
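
For reference, the "_irqsave" flavour Scott mentions would look roughly like
this fragment (sketch only; it assumes the function took opp->lock with
spin_lock_irqsave(&opp->lock, flags) on entry, whereas the queued patch keeps
the plain _irq variants):

        if (notify_eoi != -1) {
                /* Drop the lock, restoring the caller's interrupt state,
                 * around the notifier call, then re-take it. */
                spin_unlock_irqrestore(&opp->lock, flags);
                kvm_notify_acked_irq(opp->kvm, 0, notify_eoi);
                spin_lock_irqsave(&opp->lock, flags);
        }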



Re: [PATCH 8/8 v3] KVM: PPC: e500: Add e6500 core to Kconfig description

2013-04-25 Thread tiejun.chen

On 04/25/2013 05:32 PM, Caraman Mihai Claudiu-B02008 wrote:

-Original Message-
From: tiejun.chen [mailto:tiejun.c...@windriver.com]
Sent: Thursday, April 25, 2013 12:17 PM
To: Caraman Mihai Claudiu-B02008
Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org
Subject: Re: [PATCH 8/8 v3] KVM: PPC: e500: Add e6500 core to Kconfig
description

On 04/25/2013 05:09 PM, Caraman Mihai Claudiu-B02008 wrote:

-Original Message-
From: tiejun.chen [mailto:tiejun.c...@windriver.com]
Sent: Friday, April 19, 2013 1:03 PM
To: Caraman Mihai Claudiu-B02008
Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org
Subject: Re: [PATCH 8/8 v3] KVM: PPC: e500: Add e6500 core to Kconfig
description

On 04/11/2013 06:03 PM, Mihai Caraman wrote:

Add e6500 core to Kconfig description.

Signed-off-by: Mihai Caraman 
---
v3:
- No change

arch/powerpc/kvm/Kconfig |6 +++---
1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 63c67ec..4489520 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -136,15 +136,15 @@ config KVM_E500V2
  If unsure, say N.

config KVM_E500MC
-   bool "KVM support for PowerPC E500MC/E5500 processors"
+   bool "KVM support for PowerPC E500MC/E5500/E6500 processors"
depends on PPC_E500MC
select KVM
select KVM_MMIO
select KVM_BOOKE_HV
select MMU_NOTIFIER
---help---
- Support running unmodified E500MC/E5500 (32-bit) guest kernels in


I tried p5040ds before but failed with 64-bit. Are you saying this patch
set can make e5500/e6500 work well with 64-bit? If so, will we need to
upgrade qemu or something else like the dtb?


KVM should work on p5040ds with and without this patchset. The latest
qemu requires this patch: "powerpc: Add paravirt idle loop for 64-bit
Book-E"; you will not get past guest udev without it.


This is a kernel patch required by the latest qemu.


It looks like this commit is applied only to galak/powerpc.git (the 'next'
branch) and is still not merged into agraf/linux-2.6.git, so I'm confused
about which tree can support 64-bit Book3E KVM as you suggest.


Tiejun


Re: [PATCH v4 1/6] kvm: add device control API

2013-04-25 Thread Gleb Natapov
On Fri, Apr 12, 2013 at 07:08:42PM -0500, Scott Wood wrote:
> Currently, devices that are emulated inside KVM are configured in a
> hardcoded manner based on an assumption that any given architecture
> only has one way to do it.  If there's any need to access device state,
> it is done through inflexible one-purpose-only IOCTLs (e.g.
> KVM_GET/SET_LAPIC).  Defining new IOCTLs for every little thing is
> cumbersome and depletes a limited numberspace.
> 
> This API provides a mechanism to instantiate a device of a certain
> type, returning an ID that can be used to set/get attributes of the
> device.  Attributes may include configuration parameters (e.g.
> register base address), device state, operational commands, etc.  It
> is similar to the ONE_REG API, except that it acts on devices rather
> than vcpus.
> 
> Both device types and individual attributes can be tested without having
> to create the device or get/set the attribute, without the need for
> separately managing enumerated capabilities.
> 
> Signed-off-by: Scott Wood 
> ---
> v4:
>  - Move some boilerplate back into generic code, as requested by Gleb.
>File descriptor management and reference counting is no longer the
>concern of the device implementation.
> 
>  - Don't hold kvm->lock during create.  The original reasons
>    for doing so have vanished as far as MPIC is concerned, and
>this avoids needing to answer the question of whether to
>hold the lock during destroy as well.
> 
>Paul, you may need to acquire the lock yourself in kvm_create_xics()
>to protect the -EEXIST check.
> 
> v3: remove some changes that were merged into this patch by accident,
> and fix the error documentation for KVM_CREATE_DEVICE.
> ---
>  Documentation/virtual/kvm/api.txt|   70 
>  Documentation/virtual/kvm/devices/README |1 +
>  include/linux/kvm_host.h |   35 
>  include/uapi/linux/kvm.h |   27 +++
>  virt/kvm/kvm_main.c  |  129 
> ++
>  5 files changed, 262 insertions(+)
>  create mode 100644 Documentation/virtual/kvm/devices/README
> 
> diff --git a/Documentation/virtual/kvm/api.txt 
> b/Documentation/virtual/kvm/api.txt
> index 976eb65..d52f3f9 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -2173,6 +2173,76 @@ header; first `n_valid' valid entries with contents 
> from the data
>  written, then `n_invalid' invalid entries, invalidating any previously
>  valid entries found.
>  
> +4.79 KVM_CREATE_DEVICE
> +
> +Capability: KVM_CAP_DEVICE_CTRL
> +Type: vm ioctl
> +Parameters: struct kvm_create_device (in/out)
> +Returns: 0 on success, -1 on error
> +Errors:
> +  ENODEV: The device type is unknown or unsupported
> +  EEXIST: Device already created, and this type of device may not
> +  be instantiated multiple times
> +
> +  Other error conditions may be defined by individual device types or
> +  have their standard meanings.
> +
> +Creates an emulated device in the kernel.  The file descriptor returned
> +in fd can be used with KVM_SET/GET/HAS_DEVICE_ATTR.
> +
> +If the KVM_CREATE_DEVICE_TEST flag is set, only test whether the
> +device type is supported (not necessarily whether it can be created
> +in the current vm).
> +
> +Individual devices should not define flags.  Attributes should be used
> +for specifying any behavior that is not implied by the device type
> +number.
> +
> +struct kvm_create_device {
> + __u32   type;   /* in: KVM_DEV_TYPE_xxx */
> + __u32   fd; /* out: device handle */
> + __u32   flags;  /* in: KVM_CREATE_DEVICE_xxx */
> +};
Should we add __u32 padding here to make struct size multiple of u64?

> +
> +4.80 KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR
> +
> +Capability: KVM_CAP_DEVICE_CTRL
> +Type: device ioctl
> +Parameters: struct kvm_device_attr
> +Returns: 0 on success, -1 on error
> +Errors:
> +  ENXIO:  The group or attribute is unknown/unsupported for this device
> +  EPERM:  The attribute cannot (currently) be accessed this way
> +  (e.g. read-only attribute, or attribute that only makes
> +  sense when the device is in a different state)
> +
> +  Other error conditions may be defined by individual device types.
> +
> +Gets/sets a specified piece of device configuration and/or state.  The
> +semantics are device-specific.  See individual device documentation in
> +the "devices" directory.  As with ONE_REG, the size of the data
> +transferred is defined by the particular attribute.
> +
> +struct kvm_device_attr {
> + __u32   flags;  /* no flags currently defined */
> + __u32   group;  /* device-defined */
> + __u64   attr;   /* group-defined */
> + __u64   addr;   /* userspace address of attr data */
> +};
> +
> +4.81 KVM_HAS_DEVICE_ATTR
> +
> +Capability: KVM_CAP_DEVICE_CTRL
> +Type: device ioctl
> +Parameters: struct kvm_device_attr
> +Returns: 0 on success,

Re: [PATCH 0/7] KVM: irqfd generalization prepare patch set

2013-04-25 Thread Alexander Graf

On 21.04.2013, at 12:51, Michael S. Tsirkin wrote:

> On Tue, Apr 16, 2013 at 07:26:08PM +0200, Alexander Graf wrote:
>> The concept of an irqfd and interrupt routing are nothing particularly tied
>> into the IOAPIC implementation. In fact, most of the code already is 
>> perfectly
>> generic.
>> 
>> This patch set decouples most bits of the existing irqchip and irqfd
>> implementation to make it reusable for non-IOAPIC platforms, like the PPC 
>> MPIC.
>> 
>> I also have a patch that implements working irqfd support on top of these,
>> but that requires the in-kernel MPIC implementation to go upstream first, so
>> I'm holding off on it until we settled everything there, so the concept
>> certainly does work.
>> 
>> Alex
> 
> Nothing to object to here really, this is just
> moving code around.
> And patches 3 and 4 are definitely cleanups.
> Assuming this helps PPC gain in-kernel irqchip support:
> 
> Acked-by: Michael S. Tsirkin 

Could you please check the newer version of this patch set again and give your 
ack if it still holds?

  http://www.mail-archive.com/kvm-ppc@vger.kernel.org/msg06214.html


Alex



Re: KVM VM(windows xp) reseted when running geekbench for about 2 days

2013-04-25 Thread Zhanghaoyu (A)
>> >> >> On Thu, Apr 18, 2013 at 12:00:49PM +, Zhanghaoyu (A) wrote:
>> >> >>> I started 10 VMs (Windows XP) and ran the geekbench tool on 
>> >> >>> them; after about 2 days one of them was reset. I found the reset 
>> >> >>> operation is done by int kvm_cpu_exec(CPUArchState *env) {
>> >> >>>...
>> >> >>>   switch (run->exit_reason)
>> >> >>>   ...
>> >> >>>case KVM_EXIT_SHUTDOWN:
>> >> >>>DPRINTF("shutdown\n");
>> >> >>>qemu_system_reset_request();
>> >> >>>ret = EXCP_INTERRUPT;
>> >> >>>break;
>> >> >>>...
>> >> >>> }
>> >> >>> 
>> >> >>> The KVM_EXIT_SHUTDOWN exit reason was set previously in the triple fault 
>> >> >>> handler handle_triple_fault().
>> >> >>> 
>> >> >> How do you know that reset was done here? This is not the only 
>> >> >> place where qemu_system_reset_request() is called.
>> >> I used gdb to debug QEMU process, and add a breakpoint in 
>> >> qemu_system_reset_request(), when the case occurred, backtrace 
>> >> shown as below,
>> >> (gdb) bt
>> >> #0  qemu_system_reset_request () at vl.c:1964
>> >> #1  0x7f9ef9dc5991 in kvm_cpu_exec (env=0x7f9efac47100)
>> >> at /gt/qemu-kvm-1.4/qemu-kvm-1.4/kvm-all.c:1602
>> >> #2  0x7f9ef9d5b229 in qemu_kvm_cpu_thread_fn (arg=0x7f9efac47100)
>> >> at /gt/qemu-kvm-1.4/qemu-kvm-1.4/cpus.c:759
>> >> #3  0x7f9ef898b5f0 in start_thread () from 
>> >> /lib64/libpthread.so.0
>> >> #4  0x7f9ef86fa84d in clone () from /lib64/libc.so.6
>> >> #5  0x in ?? ()
>> >> 
>> >> And I added a printk in all places where the KVM_EXIT_SHUTDOWN exit reason 
>> >> is set; only handle_triple_fault() was called.
>> >> >
>> >> >Make sure XP is not set to auto-reset in case of BSOD. 
>> >> No, WinXP is not set to auto-reset in case of BSOD, and no WinXP event log
>> >> was reported.
>> >> >
>> >> >Best regards,
>> >> >Yan.
>> >> >
>> >> >> 
>> >> >>> What causes the triple fault?
>> >> >>> 
>> >> >> Are you asking what is triple fault or why it happened in your case?
>> >> What I asked is why the triple fault happened in my case.
>> >> >> For the former see here: 
>> >> >> http://en.wikipedia.org/wiki/Triple_fault
>> >> >> For the latter it is too late to tell after the VM reset. You can run 
>> >> >> QEMU with -no-reboot -no-shutdown. The VM will pause instead of 
>> >> >> rebooting and then you can examine what is going on.
>> >> Great, thanks. I'll run QEMU with the -no-reboot -no-shutdown options; if the
>> >> VM pauses in my case, what should I examine?
>> >> 
>> >Register state "info registers" in the monitor for each vcpu. Code around 
>> >the instruction that faulted.
>> 
>> I ran QEMU with the -no-reboot -no-shutdown options, and the VM paused
>> when the case happened. Then I ran "info registers" in the QEMU monitor, shown
>> below:
>> CS =0008   00c09b00 DPL =0 CS32 [-RA]
>> SS =0010   00c09300 DPL =0 DS   [-WA]
>> DS =0023   00c0f300 DPL =3 DS   [-WA]
>> FS =0030 ffdff000 1fff 00c09300 DPL =0 DS   [-WA]
>> GS =   00c0
>> LDT=   00c0
>> TR =0028 80042000 20ab 8b00 DPL=0 TSS32-busy
>> GDT= 8003f000 03ff
>> IDT= 8003f400 07ff
>> CR0=8001003b CR2=760d7fe4 CR3=002ec000 CR4=06f8 
>> DR0= DR1= DR2= 
>> DR3= DR6=0ff0 DR7=0400 
>> EFER=0800 FCW=027f FSW= [ST=0] FTW=00 MXCSR=1f80 
>> FPR0=  FPR1=  
>> FPR2=  FPR3=  
>> FPR4=  FPR5=  
>> FPR6=  FPR7=  
>> XMM00= 
>> XMM01=
>> XMM02= 
>> XMM03=
>> XMM04= 
>> XMM05=
>> XMM06= 
>> XMM07=
>> 
>> In the normal case, "info registers" in the QEMU monitor shows:
>> CS =001b   00c0fb00 DPL=3 CS32 [-RA]
>> SS =0023   00c0f300 DPL=3 DS   [-WA]
>> DS =0023   00c0f300 DPL=3 DS   [-WA]
>> FS =0038 7ffda000 0fff 0040f300 DPL=3 DS   [-WA]
>> GS =   0100
>> LDT=   
>> TR =0028 80042000 20ab 8b00 DPL=0 TSS32-busy
>> GDT= 8003f000 03ff
>> IDT= 8003f400 07ff
>> CR0=80010031 CR2=0167fd20 CR3=0af00220 CR4=06f8 
>> DR0= DR1= DR2= 
>> DR3= DR6=0ff0 DR7=0400 
>> EFER=0800 FCW=027f FSW= [ST=0] FTW=00 MXCSR=1f80
>> FPR0=00a400a40a18 d830 FPR1=0012f9c07c90e900 e900 
>> FPR2=7c910202 5d40 FPR3=01e27c903400 f808 
>> FPR4=05230012f87a  FPR5=7c905d40 0001 
>> FPR6=0001  FPR7=a

Re: [PATCH 0/7] KVM: irqfd generalization prepare patch set

2013-04-25 Thread Alexander Graf

On 25.04.2013, at 09:28, Gleb Natapov wrote:

> On Wed, Apr 24, 2013 at 01:20:31PM +0300, Gleb Natapov wrote:
>> On Tue, Apr 16, 2013 at 07:26:08PM +0200, Alexander Graf wrote:
>>> The concept of an irqfd and interrupt routing are nothing particularly tied
>>> into the IOAPIC implementation. In fact, most of the code already is 
>>> perfectly
>>> generic.
>>> 
>>> This patch set decouples most bits of the existing irqchip and irqfd
>>> implementation to make it reusable for non-IOAPIC platforms, like the PPC 
>>> MPIC.
>>> 
>>> I also have a patch that implements working irqfd support on top of these,
>>> but that requires the in-kernel MPIC implementation to go upstream first, so
>>> I'm holding off on it until we settled everything there, so the concept
>>> certainly does work.
>>> 
>>> Alex
>>> 
>> Nice cleanup, thanks! Should I expect a new series with "ifdef
>> kvm_irqchip" and ia64 compilation fixed? The fixes are minor enough for
>> me to fix them while applying.
>> 
> Actually the series does not apply any more and has to be rebased on top of 
> the
> current queue.

Heh, we're already at v3:

  http://www.mail-archive.com/kvm-ppc@vger.kernel.org/msg06214.html


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH untested] vhost: allow device specific fields per vq

2013-04-25 Thread Michael S. Tsirkin
Off-list, Asias asked about adding scsi specific fields per vq.
Something like the following would be helpful: untested, just to give
you the idea.

On top of this we can add patches to move things like ubufs
from vhost.h out to net.c
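
As a rough sketch of where this leads (the ubufs member and the to_nvq() helper
are assumptions based on the existing zerocopy code in net.c, not something this
patch adds), the device-specific wrapper simply embeds the generic virtqueue and
the wrapper is recovered with container_of() from the pointer vhost.c hands back:

/* sketch, in drivers/vhost/net.c: net-private per-vq state */
struct vhost_net_virtqueue {
	struct vhost_virtqueue vq;	/* generic vhost state, embedded first */
	struct vhost_ubuf_ref *ubufs;	/* assumed: zerocopy state moved here later */
};

/* recover the wrapper from the generic pointer passed around by vhost.c */
static inline struct vhost_net_virtqueue *to_nvq(struct vhost_virtqueue *vq)
{
	return container_of(vq, struct vhost_net_virtqueue, vq);
}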

Warning: completely untested.

Signed-off-by: Michael S. Tsirkin 

---

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index ec6fb3f..e8fa9b6 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -70,9 +70,13 @@ enum vhost_net_poll_state {
VHOST_NET_POLL_STOPPED = 2,
 };
 
+struct vhost_net_virtqueue {
+   struct vhost_virtqueue vq;
+};
+
 struct vhost_net {
struct vhost_dev dev;
-   struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
+   struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX];
struct vhost_poll poll[VHOST_NET_VQ_MAX];
/* Tells us whether we are polling a socket for TX.
 * We only do this when socket buffer fills up.
@@ -612,17 +616,26 @@ static int vhost_net_open(struct inode *inode, struct 
file *f)
 {
struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
struct vhost_dev *dev;
+   struct vhost_virtqueue **vqs;
int r;
 
if (!n)
return -ENOMEM;
+   vqs = kmalloc(VHOST_NET_VQ_MAX * sizeof(*vqs), GFP_KERNEL);
+   if (!vqs) {
+   kfree(n);
+   return -ENOMEM;
+   }
 
dev = &n->dev;
-   n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
-   n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
-   r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX);
+   vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq;
+   vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq;
+   n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick;
+   n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick;
+   r = vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX);
if (r < 0) {
kfree(n);
+   kfree(vqs);
return r;
}
 
@@ -727,6 +740,7 @@ static int vhost_net_release(struct inode *inode, struct 
file *f)
/* We do an extra flush before freeing memory,
 * since jobs can re-queue themselves. */
vhost_net_flush(n);
+   kfree(n->dev.vqs);
kfree(n);
return 0;
 }
diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
index 2968b49..ba54b3c 100644
--- a/drivers/vhost/tcm_vhost.c
+++ b/drivers/vhost/tcm_vhost.c
@@ -72,6 +72,10 @@ enum {
 #define VHOST_SCSI_MAX_TARGET  256
 #define VHOST_SCSI_MAX_VQ  128
 
+struct vhost_scsi_virtqueue {
+   struct vhost_virtqueue vq;
+};
+
 struct vhost_scsi {
/* Protected by vhost_scsi->dev.mutex */
struct tcm_vhost_tpg *vs_tpg[VHOST_SCSI_MAX_TARGET];
@@ -79,7 +83,7 @@ struct vhost_scsi {
bool vs_endpoint;
 
struct vhost_dev dev;
-   struct vhost_virtqueue vqs[VHOST_SCSI_MAX_VQ];
+   struct vhost_scsi_virtqueue vqs[VHOST_SCSI_MAX_VQ];
 
struct vhost_work vs_completion_work; /* cmd completion work item */
struct llist_head vs_completion_list; /* cmd completion queue */
@@ -902,20 +906,32 @@ err_dev:
 static int vhost_scsi_open(struct inode *inode, struct file *f)
 {
struct vhost_scsi *s;
+   struct vhost_virtqueue **vqs;
int r, i;
 
s = kzalloc(sizeof(*s), GFP_KERNEL);
if (!s)
return -ENOMEM;
 
+   vqs = kmalloc(VHOST_SCSI_MAX_VQ * sizeof(*vqs), GFP_KERNEL);
+   if (!vqs) {
+   kfree(s);
+   return -ENOMEM;
+   }
+
vhost_work_init(&s->vs_completion_work, vhost_scsi_complete_cmd_work);
 
-   s->vqs[VHOST_SCSI_VQ_CTL].handle_kick = vhost_scsi_ctl_handle_kick;
-   s->vqs[VHOST_SCSI_VQ_EVT].handle_kick = vhost_scsi_evt_handle_kick;
-   for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++)
-   s->vqs[i].handle_kick = vhost_scsi_handle_kick;
-   r = vhost_dev_init(&s->dev, s->vqs, VHOST_SCSI_MAX_VQ);
+   vqs[VHOST_SCSI_VQ_CTL] = &s->vqs[VHOST_SCSI_VQ_CTL].vq;
+   vqs[VHOST_SCSI_VQ_EVT] = &s->vqs[VHOST_SCSI_VQ_EVT].vq;
+   s->vqs[VHOST_SCSI_VQ_CTL].vq.handle_kick = vhost_scsi_ctl_handle_kick;
+   s->vqs[VHOST_SCSI_VQ_EVT].vq.handle_kick = vhost_scsi_evt_handle_kick;
+   for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++) {
+   vqs[i] = &s->vqs[i].vq;
+   s->vqs[i].vq.handle_kick = vhost_scsi_handle_kick;
+   }
+   r = vhost_dev_init(&s->dev, vqs, VHOST_SCSI_MAX_VQ);
if (r < 0) {
+   kfree(vqs);
kfree(s);
return r;
}
@@ -935,6 +951,7 @@ static int vhost_scsi_release(struct inode *inode, struct 
file *f)
vhost_scsi_clear_endpoint(s, &t);
vhost_dev_stop(&s->dev);
vhost_dev_cleanup(&s->dev, false);
+   kfree(s->dev.vqs);
kfree(s);
return 0;
 }
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 9759249..666ed34 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers

Re: [PATCH v2 3/6] KVM: MMU: make return value of mmio page fault handler more readable

2013-04-25 Thread Xiao Guangrong
On 04/24/2013 09:34 PM, Gleb Natapov wrote:

>> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
>> index 2adcbc2..6b4ba1e 100644
>> --- a/arch/x86/kvm/mmu.h
>> +++ b/arch/x86/kvm/mmu.h
>> @@ -52,6 +52,20 @@
>>  
>>  int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 
>> sptes[4]);
>>  void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
>> +
>> +/*
>> + * Return values of handle_mmio_page_fault_common:
>> + * RET_MMIO_PF_EMU: it is a real mmio page fault, emulate the instruction
>> + *  directly.
>> + * RET_MMIO_PF_RETRY: let CPU fault again on the address.
>> + * RET_MMIO_PF_BUG: bug is detected.
>> + */
>> +enum {
>> +RET_MMIO_PF_EMU = 1,
> Make it RET_MMIO_PF_EMULATE please.

Good to me, will do.

Thanks!
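
For context, a hedged sketch of how a caller might branch on these values after the
suggested rename; the call site and the emulator invocation are illustrative, not
taken from this series:

	switch (handle_mmio_page_fault_common(vcpu, addr, direct)) {
	case RET_MMIO_PF_EMULATE:
		/* a real MMIO access: hand the faulting instruction to the emulator */
		return x86_emulate_instruction(vcpu, addr, 0, NULL, 0) == EMULATE_DONE;
	case RET_MMIO_PF_RETRY:
		/* e.g. a stale MMIO spte: re-enter the guest and let it fault again */
		return 1;
	case RET_MMIO_PF_BUG:
	default:
		/* inconsistent spte/generation state: flag the bug */
		WARN_ON(1);
		return -1;
	}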

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 6/6] KVM: MMU: init kvm generation close to mmio wrap-around value

2013-04-25 Thread Xiao Guangrong
On 04/24/2013 08:59 PM, Gleb Natapov wrote:
> On Mon, Apr 01, 2013 at 05:56:49PM +0800, Xiao Guangrong wrote:
>> Then it has a chance to trigger the mmio generation number wrap-around
>>
>> Signed-off-by: Xiao Guangrong 
>> ---
>>  arch/x86/include/asm/kvm_host.h |1 +
>>  arch/x86/kvm/mmu.c  |8 
>>  virt/kvm/kvm_main.c |6 ++
>>  3 files changed, 15 insertions(+), 0 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h 
>> b/arch/x86/include/asm/kvm_host.h
>> index 6c1e642..4e1f7cb 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -767,6 +767,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
>>   struct kvm_memory_slot *slot,
>>   gfn_t gfn_offset, unsigned long mask);
>>  void kvm_mmu_zap_all(struct kvm *kvm);
>> +void kvm_arch_init_generation(struct kvm *kvm);
>>  void kvm_mmu_invalid_mmio_sptes(struct kvm *kvm);
>>  unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
>>  void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int 
>> kvm_nr_mmu_pages);
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index d314e21..dcc059c 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -4279,6 +4279,14 @@ restart:
>>  spin_unlock(&kvm->mmu_lock);
>>  }
>>  
>> +void kvm_arch_init_generation(struct kvm *kvm)
>> +{
>> +mutex_lock(&kvm->slots_lock);
>> +/* It is easier to trigger mmio generation-number wrap-around. */
>> +kvm_memslots(kvm)->generation = MMIO_MAX_GEN - 13;
> kvm_memslots(kvm)->generation should never overflow since
> (read|write)_cached mechanism does not handle it. Initialising it to
> anything but 0 makes overflow more likely.
> 
> You can hide mmio overflow trick in kvm_current_mmio_generation():
> 
> static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
> {
>   return (kvm_memslots(kvm)->generation + MMIO_MAX_GEN - 13) & 
> MMIO_GEN_MASK;
> }

Very smart idea. Thank you, Gleb!



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Bug 53611] New: nVMX: Add nested EPT

2013-04-25 Thread Gleb Natapov
On Thu, Apr 25, 2013 at 01:00:42AM -0700, Nakajima, Jun wrote:
> On Wed, Apr 24, 2013 at 8:55 AM, Nakajima, Jun  wrote:
> > Sorry about the slow progress. We've been distracted by some priority
> > things. The patches are ready (i.e. working), but we are cleaning them
> > up. I'll send what we have today.
> 
> So, I have sent them, and frankly we are still cleaning up.  Please
> bear with us.
> We are also sending one more patchset to deal with EPT
> misconfiguration, but Linux should run in L2 on top of L1 KVM.
> 
The patches are mangled and unreadable. Please resend using "git
send-email".

--
Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 8/8 v3] KVM: PPC: e500: Add e6500 core to Kconfig description

2013-04-25 Thread tiejun.chen

On 04/25/2013 05:09 PM, Caraman Mihai Claudiu-B02008 wrote:

> > -Original Message-
> > From: tiejun.chen [mailto:tiejun.c...@windriver.com]
> > Sent: Friday, April 19, 2013 1:03 PM
> > To: Caraman Mihai Claudiu-B02008
> > Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org
> > Subject: Re: [PATCH 8/8 v3] KVM: PPC: e500: Add e6500 core to Kconfig
> > description
> > 
> > On 04/11/2013 06:03 PM, Mihai Caraman wrote:
> > >
> > > Add e6500 core to Kconfig description.
> > >
> > > Signed-off-by: Mihai Caraman 
> > > ---
> > > v3:
> > >   - No change
> > >
> > >   arch/powerpc/kvm/Kconfig |6 +++---
> > >   1 files changed, 3 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
> > > index 63c67ec..4489520 100644
> > > --- a/arch/powerpc/kvm/Kconfig
> > > +++ b/arch/powerpc/kvm/Kconfig
> > > @@ -136,15 +136,15 @@ config KVM_E500V2
> > >   If unsure, say N.
> > >
> > >   config KVM_E500MC
> > > -   bool "KVM support for PowerPC E500MC/E5500 processors"
> > > +   bool "KVM support for PowerPC E500MC/E5500/E6500 processors"
> > > depends on PPC_E500MC
> > > select KVM
> > > select KVM_MMIO
> > > select KVM_BOOKE_HV
> > > select MMU_NOTIFIER
> > > ---help---
> > > - Support running unmodified E500MC/E5500 (32-bit) guest kernels in
> > 
> > 
> > I ever tried p5040ds but failed with 64-bit, but looks are you saying
> > this patch
> > set can make e5500/e6500 work well with 64-bit? If so, will we need to
> > upgrade
> > qemu or something else like dtb?
> 
> 
> KVM should work on p5040ds with and without this patchset. The latest
> qemu requires this patch: "powerpc: Add paravirt idle loop for 64-bit Book-E",
> you will not pass guest udev without it.


Which qemu tree should be used here?

My tree is cloned from:

git://repo.or.cz/qemu/agraf.git ppc-next

But I can't find this commit.

Tiejun

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH 8/8 v3] KVM: PPC: e500: Add e6500 core to Kconfig description

2013-04-25 Thread Caraman Mihai Claudiu-B02008
> -Original Message-
> From: tiejun.chen [mailto:tiejun.c...@windriver.com]
> Sent: Friday, April 19, 2013 1:03 PM
> To: Caraman Mihai Claudiu-B02008
> Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org
> Subject: Re: [PATCH 8/8 v3] KVM: PPC: e500: Add e6500 core to Kconfig
> description
> 
> On 04/11/2013 06:03 PM, Mihai Caraman wrote:
> > Add e6500 core to Kconfig description.
> >
> > Signed-off-by: Mihai Caraman 
> > ---
> > v3:
> >   - No change
> >
> >   arch/powerpc/kvm/Kconfig |6 +++---
> >   1 files changed, 3 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
> > index 63c67ec..4489520 100644
> > --- a/arch/powerpc/kvm/Kconfig
> > +++ b/arch/powerpc/kvm/Kconfig
> > @@ -136,15 +136,15 @@ config KVM_E500V2
> >   If unsure, say N.
> >
> >   config KVM_E500MC
> > -   bool "KVM support for PowerPC E500MC/E5500 processors"
> > +   bool "KVM support for PowerPC E500MC/E5500/E6500 processors"
> > depends on PPC_E500MC
> > select KVM
> > select KVM_MMIO
> > select KVM_BOOKE_HV
> > select MMU_NOTIFIER
> > ---help---
> > - Support running unmodified E500MC/E5500 (32-bit) guest kernels in
> 
> I ever tried p5040ds but failed with 64-bit, but looks are you saying
> this patch
> set can make e5500/e6500 work well with 64-bit? If so, will we need to
> upgrade
> qemu or something else like dtb?

KVM should work on p5040ds with and without this patchset. The latest 
qemu requires this patch: "powerpc: Add paravirt idle loop for 64-bit Book-E",
you will not pass guest udev without it.
Please detail what fails on p5040ds.

-Mike


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH] kvm/powerpc/e500mc: fix tlb invalidation on cpu migration

2013-04-25 Thread Caraman Mihai Claudiu-B02008
> On 08.03.2013, at 21:25, Scott Wood wrote:
> 
> > The existing check handles the case where we've migrated to a different
> > core than we last ran on, but it doesn't handle the case where we're
> > still on the same cpu we last ran on, but some other vcpu has run on
> > this cpu in the meantime.
> >
> > Without this, guest segfaults (and other misbehavior) have been seen in
> > smp guests.
> >
> > Cc: sta...@vger.kernel.org # 3.8.x
> > Signed-off-by: Scott Wood 
> 
> Thanks, applied to kvm-ppc-3.9.
> 
> 
> Alex

Can you pull it into kvm-ppc-queue?

Thanks,
Mike

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v11 0/4] tcm_vhost hotplug

2013-04-25 Thread Nicholas A. Bellinger
On Thu, 2013-04-25 at 10:39 +0300, Michael S. Tsirkin wrote:
> On Thu, Apr 25, 2013 at 03:35:19PM +0800, Asias He wrote:
> > Changes in v11
> > - Drop change log history in commit log
> > 
> > Changes in v10
> > - Drop comments about lun
> > - Add Enable VIRTIO_SCSI_F_HOTPLUG to this series
> > 
> > Changes in v9
> > - Drop tcm_vhost_check_feature
> > - Add Refactor the lock nesting rule to this series
> > 
> > Asias He (4):
> >   tcm_vhost: Refactor the lock nesting rule
> >   tcm_vhost: Add hotplug/hotunplug support
> >   tcm_vhost: Add ioctl to get and set events missed flag
> >   tcm_vhost: Enable VIRTIO_SCSI_F_HOTPLUG
> > 
> >  drivers/vhost/tcm_vhost.c | 262 
> > +++---
> >  drivers/vhost/tcm_vhost.h |  13 +++
> >  2 files changed, 259 insertions(+), 16 deletions(-)
> 
> 
> Acked-by: Michael S. Tsirkin 
> 

Applied to target-pending/for-next.

Nice work Asias & MST !

--nab


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 01/12] Subject: [PATCH 01/10] nEPT: Support LOAD_IA32_EFER entry/exit controls for L1

2013-04-25 Thread Gleb Natapov
All the patches are mangled by your email client. Please use "git
send-email --thread" to send them.

On Thu, Apr 25, 2013 at 12:50:19AM -0700, Nakajima, Jun wrote:
> Recent KVM, since http://kerneltrap.org/mailarchive/linux-kvm/2010/5/2/6261577
> switches the EFER MSR when EPT is used and the host and guest have different
> NX bits. So if we add support for nested EPT (L1 guest using EPT to run L2)
> and want to be able to run recent KVM as L1, we need to allow L1 to use this
> EFER switching feature.
> 
> To do this EFER switching, KVM uses VM_ENTRY/EXIT_LOAD_IA32_EFER if available,
> and if it isn't, it uses the generic VM_ENTRY/EXIT_MSR_LOAD. This patch adds
> support for the former (the latter is still unsupported).
> 
> Nested entry and exit emulation (prepare_vmcs_02 and load_vmcs12_host_state,
> respectively) already handled VM_ENTRY/EXIT_LOAD_IA32_EFER correctly. So all
> that's left to do in this patch is to properly advertise this feature to L1.
> 
> Note that vmcs12's VM_ENTRY/EXIT_LOAD_IA32_EFER are emulated by L0, by using
> vmx_set_efer (which itself sets one of several vmcs02 fields), so we always
> support this feature, regardless of whether the host supports it.
> 
> Signed-off-by: Nadav Har'El 
> Signed-off-by: Jun Nakajima 
> 
> modified:   arch/x86/kvm/vmx.c
> ---
>  arch/x86/kvm/vmx.c | 18 ++
>  1 file changed, 14 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 6667042..9e0ec9d 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -2057,6 +2057,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
>  #else
>   nested_vmx_exit_ctls_high = 0;
>  #endif
> + nested_vmx_exit_ctls_high |= VM_EXIT_LOAD_IA32_EFER;
> 
>   /* entry controls */
>   rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
> @@ -2064,6 +2065,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
>   nested_vmx_entry_ctls_low = 0;
>   nested_vmx_entry_ctls_high &=
>   VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
> + nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_IA32_EFER;
> 
>   /* cpu-based controls */
>   rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
> @@ -7050,10 +7052,18 @@ static void prepare_vmcs02(struct kvm_vcpu
> *vcpu, struct vmcs12 *vmcs12)
>   vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
>   vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
> 
> - /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */
> - vmcs_write32(VM_EXIT_CONTROLS,
> - vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
> - vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
> + /* L2->L1 exit controls are emulated - the hardware exit is to L0 so
> + * we should use its exit controls. Note that IA32_MODE, LOAD_IA32_EFER
> + * bits are further modified by vmx_set_efer() below.
> + */
> + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
> +
> + /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
> + * emulated by vmx_set_efer(), below.
> + */
> + vmcs_write32(VM_ENTRY_CONTROLS,
> + (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
> + ~VM_ENTRY_IA32E_MODE) |
>   (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
> 
>   if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)
> --
> 1.8.2.1.610.g562af5b
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Bug 53611] New: nVMX: Add nested EPT

2013-04-25 Thread Nakajima, Jun
On Wed, Apr 24, 2013 at 8:55 AM, Nakajima, Jun  wrote:
> Sorry about the slow progress. We've been distracted by some priority
> things. The patches are ready (i.e. working), but we are cleaning them
> up. I'll send what we have today.

So, I have sent them, and frankly we are still cleaning up.  Please
bear with us.
We are also sending one more patchset to deal with EPT
misconfiguration, but Linux should run in L2 on top of L1 KVM.

--
Jun
Intel Open Source Technology Center
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 11/12] Move the routines to paging_tmpl.h to make them diffrent for virtual EPT.

2013-04-25 Thread Nakajima, Jun
Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 

modified:   arch/x86/kvm/mmu.c
---
 arch/x86/kvm/mmu.c | 30 --
 1 file changed, 30 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 34e406e2..99bfc5e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2480,26 +2480,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct
kvm_vcpu *vcpu, gfn_t gfn,
  return gfn_to_pfn_memslot_atomic(slot, gfn);
 }

-static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
-  struct kvm_mmu_page *sp, u64 *spte,
-  u64 gpte)
-{
- if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
- goto no_present;
-
- if (!is_present_gpte(gpte))
- goto no_present;
-
- if (!(gpte & PT_ACCESSED_MASK))
- goto no_present;
-
- return false;
-
-no_present:
- drop_spte(vcpu->kvm, spte);
- return true;
-}
-
 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
 struct kvm_mmu_page *sp,
 u64 *start, u64 *end)
@@ -3399,16 +3379,6 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t
gfn, unsigned access,
  return false;
 }

-static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
-{
- unsigned access;
-
- access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
- access &= ~(gpte >> PT64_NX_SHIFT);
-
- return access;
-}
-
 static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level,
unsigned gpte)
 {
  unsigned index;
--
1.8.2.1.610.g562af5b
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 12/12] Provide the correct exit qualification upon EPT violation to L1 VMM.

2013-04-25 Thread Nakajima, Jun
Since vcpu_vmx is local to vmx.c, use kvm_vcpu_arch so that we can
use the exit qualification in paging_tmpl.h.

Signed-off-by: Jun Nakajima 

modified:   arch/x86/include/asm/kvm_host.h
modified:   arch/x86/kvm/paging_tmpl.h
modified:   arch/x86/kvm/vmx.c
---
 arch/x86/include/asm/kvm_host.h | 2 ++
 arch/x86/kvm/paging_tmpl.h  | 4 
 arch/x86/kvm/vmx.c  | 3 +++
 3 files changed, 9 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4979778..5d1fdf2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -504,6 +504,8 @@ struct kvm_vcpu_arch {
  * instruction.
  */
  bool write_fault_to_shadow_pgtable;
+
+ unsigned long exit_qualification;
 };

 struct kvm_lpage_info {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6226b51..0da6044 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -349,7 +349,11 @@ error:

  walker->fault.vector = PF_VECTOR;
  walker->fault.error_code_valid = true;
+#if PTTYPE != PTTYPE_EPT
  walker->fault.error_code = errcode;
+#else
+ walker->fault.error_code = vcpu->arch.exit_qualification & 0x7; /*
exit_qualification */
+#endif
  walker->fault.address = addr;
  walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 95304cc..61e2853 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -425,6 +425,7 @@ struct vcpu_vmx {
  ktime_t entry_time;
  s64 vnmi_blocked_time;
  u32 exit_reason;
+ unsigned long exit_qualification;

  bool rdtscp_enabled;

@@ -5074,6 +5075,8 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
  /* ept page table is present? */
  error_code |= (exit_qualification >> 3) & 0x1;

+vcpu->arch.exit_qualification = exit_qualification;
+
  return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
 }

--
1.8.2.1.610.g562af5b
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 10/12] Subject: [PATCH 10/10] nEPT: Miscelleneous cleanups

2013-04-25 Thread Nakajima, Jun
Some trivial code cleanups not really related to nested EPT.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 

modified:   arch/x86/include/asm/vmx.h
modified:   arch/x86/kvm/vmx.c
---
 arch/x86/include/asm/vmx.h | 44 
 arch/x86/kvm/vmx.c |  3 +--
 2 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 0ce54f3..5838be1 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -254,6 +254,50 @@ enum vmcs_field {
  HOST_RIP= 0x6c16,
 };

+#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x8000
+
+#define EXIT_REASON_EXCEPTION_NMI   0
+#define EXIT_REASON_EXTERNAL_INTERRUPT  1
+#define EXIT_REASON_TRIPLE_FAULT2
+
+#define EXIT_REASON_PENDING_INTERRUPT   7
+#define EXIT_REASON_NMI_WINDOW 8
+#define EXIT_REASON_TASK_SWITCH 9
+#define EXIT_REASON_CPUID   10
+#define EXIT_REASON_HLT 12
+#define EXIT_REASON_INVD13
+#define EXIT_REASON_INVLPG  14
+#define EXIT_REASON_RDPMC   15
+#define EXIT_REASON_RDTSC   16
+#define EXIT_REASON_VMCALL  18
+#define EXIT_REASON_VMCLEAR 19
+#define EXIT_REASON_VMLAUNCH20
+#define EXIT_REASON_VMPTRLD 21
+#define EXIT_REASON_VMPTRST 22
+#define EXIT_REASON_VMREAD  23
+#define EXIT_REASON_VMRESUME24
+#define EXIT_REASON_VMWRITE 25
+#define EXIT_REASON_VMOFF   26
+#define EXIT_REASON_VMON27
+#define EXIT_REASON_CR_ACCESS   28
+#define EXIT_REASON_DR_ACCESS   29
+#define EXIT_REASON_IO_INSTRUCTION  30
+#define EXIT_REASON_MSR_READ31
+#define EXIT_REASON_MSR_WRITE   32
+#define EXIT_REASON_INVALID_STATE 33
+#define EXIT_REASON_MWAIT_INSTRUCTION   36
+#define EXIT_REASON_MONITOR_INSTRUCTION 39
+#define EXIT_REASON_PAUSE_INSTRUCTION   40
+#define EXIT_REASON_MCE_DURING_VMENTRY 41
+#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
+#define EXIT_REASON_APIC_ACCESS 44
+#define EXIT_REASON_EPT_VIOLATION   48
+#define EXIT_REASON_EPT_MISCONFIG   49
+#define EXIT_REASON_INVEPT 50
+#define EXIT_REASON_WBINVD 54
+#define EXIT_REASON_XSETBV 55
+#define EXIT_REASON_INVPCID 58
+
 /*
  * Interruption-information format
  */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 10f2a69..95304cc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -616,7 +616,6 @@ static void nested_release_page_clean(struct page *page)
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
-static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
 struct kvm_segment *var, int seg);
@@ -6320,7 +6319,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)

  if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
-get_vmcs12(vcpu), vcpu)))) {
+ get_vmcs12(vcpu))))) {
  if (vmx_interrupt_allowed(vcpu)) {
  vmx->soft_vnmi_blocked = 0;
  } else if (vmx->vnmi_blocked_time > 10LL &&
--
1.8.2.1.610.g562af5b
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 09/12] Subject: [PATCH 09/10] nEPT: Documentation

2013-04-25 Thread Nakajima, Jun
Update the documentation to no longer say that nested EPT is not supported.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 

modified:   Documentation/virtual/kvm/nested-vmx.txt
---
 Documentation/virtual/kvm/nested-vmx.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/virtual/kvm/nested-vmx.txt
b/Documentation/virtual/kvm/nested-vmx.txt
index 8ed937d..cdf7839 100644
--- a/Documentation/virtual/kvm/nested-vmx.txt
+++ b/Documentation/virtual/kvm/nested-vmx.txt
@@ -38,8 +38,8 @@ The current code supports running Linux guests under
KVM guests.
 Only 64-bit guest hypervisors are supported.

 Additional patches for running Windows under guest KVM, and Linux under
-guest VMware server, and support for nested EPT, are currently running in
-the lab, and will be sent as follow-on patchsets.
+guest VMware server, are currently running in the lab, and will be sent as
+follow-on patchsets.


 Running nested VMX
--
1.8.2.1.610.g562af5b
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 08/12] Subject: [PATCH 08/10] nEPT: Nested INVEPT

2013-04-25 Thread Nakajima, Jun
If we let L1 use EPT, we should probably also support the INVEPT instruction.

In our current nested EPT implementation, when L1 changes its EPT table for
L2 (i.e., EPT12), L0 modifies the shadow EPT table (EPT02), and in the course
of this modification already calls INVEPT. Therefore, when L1 calls INVEPT,
we don't really need to do anything. In particular we *don't* need to call
the real INVEPT again. All we do in our INVEPT is verify the validity of the
call, and its parameters, and then do nothing.

In KVM Forum 2010, Dong et al. presented "Nested Virtualization Friendly KVM"
and classified our current nested EPT implementation as "shadow-like virtual
EPT". He recommended instead a different approach, which he called "VTLB-like
virtual EPT". If we had taken that alternative approach, INVEPT would have had
a bigger role: L0 would only rebuild the shadow EPT table when L1 calls INVEPT.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 

modified:   arch/x86/include/asm/vmx.h
modified:   arch/x86/kvm/vmx.c
---
 arch/x86/include/asm/vmx.h |  4 ++-
 arch/x86/kvm/vmx.c | 83 ++
 2 files changed, 86 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index b6fbf86..0ce54f3 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -376,7 +376,9 @@ enum vmcs_field {
 #define VMX_EPTP_WB_BIT (1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
 #define VMX_EPT_1GB_PAGE_BIT (1ull << 17)
-#define VMX_EPT_AD_BIT(1ull << 21)
+#define VMX_EPT_INVEPT_BIT (1ull << 20)
+#define VMX_EPT_AD_BIT (1ull << 21)
+#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24)
 #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a5e14d1..10f2a69 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5878,6 +5878,87 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
  return 1;
 }

+/* Emulate the INVEPT instruction */
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
+ u32 vmx_instruction_info;
+ unsigned long type;
+ gva_t gva;
+ struct x86_exception e;
+ struct {
+ u64 eptp, gpa;
+ } operand;
+
+ if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
+!(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ /* According to the Intel VMX instruction reference, the memory
+ * operand is read even if it isn't needed (e.g., for type==global)
+ */
+ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+ vmx_instruction_info, &gva))
+ return 1;
+ if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
+ sizeof(operand), &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
+
+ type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+ switch (type) {
+ case VMX_EPT_EXTENT_GLOBAL:
+ if (!(nested_vmx_ept_caps & VMX_EPT_EXTENT_GLOBAL_BIT))
+ nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ else {
+ /*
+ * Do nothing: when L1 changes EPT12, we already
+ * update EPT02 (the shadow EPT table) and call INVEPT.
+ * So when L1 calls INVEPT, there's nothing left to do.
+ */
+ nested_vmx_succeed(vcpu);
+ }
+ break;
+ case VMX_EPT_EXTENT_CONTEXT:
+ if (!(nested_vmx_ept_caps & VMX_EPT_EXTENT_CONTEXT_BIT))
+ nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ else {
+ /* Do nothing */
+ nested_vmx_succeed(vcpu);
+ }
+ break;
+ case VMX_EPT_EXTENT_INDIVIDUAL_ADDR:
+ if (!(nested_vmx_ept_caps & VMX_EPT_EXTENT_INDIVIDUAL_BIT))
+ nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ else {
+ /* Do nothing */
+ nested_vmx_succeed(vcpu);
+ }
+ break;
+ default:
+ nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ }
+
+ skip_emulated_instruction(vcpu);
+ return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -5922,6 +6003,7 @@ static int (*const
kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
  [EXIT_REASON_PAUSE_INSTRUCTION]   = handle_pause,
  [EXIT_REASON_MWAIT_INSTRUCTION]  = handle_invalid_op,
  [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op,
+ [EXIT_REASON_INVEPT]  = handle_invept,
 };

 static const int kvm_vmx_max_exit_handlers =
@@ -6106,6 +6188,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
  case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
  case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
  case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
+ case EXIT_REASON_INVEPT:
  /*
  * VMX instructions trap unconditionally. This allows

[PATCH 07/12] Subject: [PATCH 07/10] nEPT: Advertise EPT to L1

2013-04-25 Thread Nakajima, Jun
Advertise the support of EPT to the L1 guest, through the appropriate MSR.

This is the last patch of the basic Nested EPT feature, so as to allow
bisection through this patch series: The guest will not see EPT support until
this last patch, and will not attempt to use the half-applied feature.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 

modified:   arch/x86/kvm/vmx.c
---
 arch/x86/kvm/vmx.c | 17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0e99b15..a5e14d1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2026,6 +2026,7 @@ static u32 nested_vmx_secondary_ctls_low,
nested_vmx_secondary_ctls_high;
 static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
 static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
 static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
+static u32 nested_vmx_ept_caps;
 static __init void nested_vmx_setup_ctls_msrs(void)
 {
  /*
@@ -2101,6 +2102,18 @@ static __init void nested_vmx_setup_ctls_msrs(void)
  nested_vmx_secondary_ctls_low = 0;
  nested_vmx_secondary_ctls_high &=
  SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ if (enable_ept) {
+ /* nested EPT: emulate EPT also to L1 */
+ nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
+ nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT;
+ nested_vmx_ept_caps |=
+ VMX_EPT_INVEPT_BIT | VMX_EPT_EXTENT_GLOBAL_BIT |
+ VMX_EPT_EXTENT_CONTEXT_BIT |
+ VMX_EPT_EXTENT_INDIVIDUAL_BIT;
+ nested_vmx_ept_caps &= vmx_capability.ept;
+ } else
+ nested_vmx_ept_caps = 0;
+
 }

 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
@@ -2200,8 +2213,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu
*vcpu, u32 msr_index, u64 *pdata)
  nested_vmx_secondary_ctls_high);
  break;
  case MSR_IA32_VMX_EPT_VPID_CAP:
- /* Currently, no nested ept or nested vpid */
- *pdata = 0;
+ /* Currently, no nested vpid support */
+ *pdata = nested_vmx_ept_caps;
  break;
  default:
  return 0;
--
1.8.2.1.610.g562af5b
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 06/12] Subject: [PATCH 06/10] nEPT: Some additional comments

2013-04-25 Thread Nakajima, Jun
Some additional comments to preexisting code:
Explain who (L0 or L1) handles EPT violation and misconfiguration exits.
Don't mention "shadow on either EPT or shadow" as the only two options.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 

modified:   arch/x86/kvm/vmx.c
---
 arch/x86/kvm/vmx.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d4bfd32..0e99b15 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6126,7 +6126,20 @@ static bool nested_vmx_exit_handled(struct
kvm_vcpu *vcpu)
  return nested_cpu_has2(vmcs12,
  SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
  case EXIT_REASON_EPT_VIOLATION:
+ /*
+ * L0 always deals with the EPT violation. If nested EPT is
+ * used, and the nested mmu code discovers that the address is
+ * missing in the guest EPT table (EPT12), the EPT violation
+ * will be injected with nested_ept_inject_page_fault()
+ */
+ return 0;
  case EXIT_REASON_EPT_MISCONFIG:
+ /*
+ * L2 never uses directly L1's EPT, but rather L0's own EPT
+ * table (shadow on EPT) or a merged EPT table that L0 built
+ * (EPT on EPT). So any problems with the structure of the
+ * table is L0's fault.
+ */
  return 0;
  case EXIT_REASON_WBINVD:
  return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
--
1.8.2.1.610.g562af5b
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 05/12] Subject: [PATCH 05/10] nEPT: Fix wrong test in kvm_set_cr3

2013-04-25 Thread Nakajima, Jun
kvm_set_cr3() attempts to check if the new cr3 is a valid guest physical
address. The problem is that with nested EPT, cr3 is an *L2* physical
address, not an L1 physical address as this test expects.

As the comment above this test explains, it isn't necessary, and doesn't
correspond to anything a real processor would do. So this patch removes it.

Note that this wrong test could have also theoretically caused problems
in nested NPT, not just in nested EPT. However, in practice, the problem
was avoided: nested_svm_vmexit()/vmrun() do not call kvm_set_cr3 in the
nested NPT case, and instead set the vmcb (and arch.cr3) directly, thus
circumventing the problem. Additional potential calls to the buggy function
are avoided in that we don't trap cr3 modifications when nested NPT is
enabled. However, because in nested VMX we did want to use kvm_set_cr3()
(as requested in Avi Kivity's review of the original nested VMX patches),
we can't avoid this problem and need to fix it.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 

modified:   arch/x86/kvm/x86.c
---
 arch/x86/kvm/x86.c | 11 ---
 1 file changed, 11 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e172132..c34590d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -659,17 +659,6 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
  */
  }

- /*
- * Does the new cr3 value map to physical memory? (Note, we
- * catch an invalid cr3 even in real-mode, because it would
- * cause trouble later on when we turn on paging anyway.)
- *
- * A real CPU would silently accept an invalid cr3 and would
- * attempt to use it - with largely undefined (and often hard
- * to debug) behavior on the guest side.
- */
- if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
- return 1;
  vcpu->arch.cr3 = cr3;
  __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
  vcpu->arch.mmu.new_cr3(vcpu);
--
1.8.2.1.610.g562af5b
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 04/12] Subject: [PATCH 04/10] nEPT: Fix cr3 handling in nested exit and entry

2013-04-25 Thread Nakajima, Jun
The existing code for handling cr3 and related VMCS fields during nested
exit and entry wasn't correct in all cases:

If L2 is allowed to control cr3 (and this is indeed the case in nested EPT),
during nested exit we must copy the modified cr3 from vmcs02 to vmcs12, and
we forgot to do so. This patch adds this copy.

If L0 isn't controlling cr3 when running L2 (i.e., L0 is using EPT), and
whoever does control cr3 (L1 or L2) is using PAE, the processor might have
saved PDPTEs and we should also save them in vmcs12 (and restore later).

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 

modified:   arch/x86/kvm/vmx.c
---
 arch/x86/kvm/vmx.c | 37 -
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f2fd79d..d4bfd32 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7162,10 +7162,26 @@ static void prepare_vmcs02(struct kvm_vcpu
*vcpu, struct vmcs12 *vmcs12)
  vmx_set_cr4(vcpu, vmcs12->guest_cr4);
  vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));

- /* shadow page tables on either EPT or shadow page tables */
+ /*
+ * Note that kvm_set_cr3() and kvm_mmu_reset_context() will do the
+ * right thing, and set GUEST_CR3 and/or EPT_POINTER in all supported
+ * settings: 1. shadow page tables on shadow page tables, 2. shadow
+ * page tables on EPT, 3. EPT on EPT.
+ */
  kvm_set_cr3(vcpu, vmcs12->guest_cr3);
  kvm_mmu_reset_context(vcpu);

+ /*
+ * Additionally, except when L0 is using shadow page tables, L1 or
+ * L2 control guest_cr3 for L2, so they may also have saved PDPTEs
+ */
+ if (enable_ept) {
+ vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+ vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+ vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+ vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+ }
+
  kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
  kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
 }
@@ -7397,6 +7413,25 @@ void prepare_vmcs12(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
  vmcs12->guest_pending_dbg_exceptions =
  vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);

+ /*
+ * In some cases (usually, nested EPT), L2 is allowed to change its
+ * own CR3 without exiting. If it has changed it, we must keep it.
+ * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
+ * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
+ */
+ if (enable_ept)
+ vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3);
+ /*
+ * Additionally, except when L0 is using shadow page tables, L1 or
+ * L2 control guest_cr3 for L2, so save their PDPTEs
+ */
+ if (enable_ept) {
+ vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+ vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+ vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+ vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+ }
+
  /* TODO: These cannot have changed unless we have MSR bitmaps and
  * the relevant bit asks not to trap the change */
  vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
--
1.8.2.1.610.g562af5b
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 03/12] Subject: [PATCH 03/10] nEPT: MMU context for nested EPT

2013-04-25 Thread Nakajima, Jun
KVM's existing shadow MMU code already supports nested TDP. To use it, we
need to set up a new "MMU context" for nested EPT, and create a few callbacks
for it (nested_ept_*()). This context should also use the EPT versions of
the page table access functions (defined in the previous patch).
Then, we need to switch back and forth between this nested context and the
regular MMU context when switching between L1 and L2 (when L1 runs this L2
with EPT).

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 

modified:   arch/x86/kvm/mmu.c
modified:   arch/x86/kvm/mmu.h
modified:   arch/x86/kvm/vmx.c
---
 arch/x86/kvm/mmu.c | 38 
 arch/x86/kvm/mmu.h |  1 +
 arch/x86/kvm/vmx.c | 56 +++---
 3 files changed, 92 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 91cac19..34e406e2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3674,6 +3674,44 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
struct kvm_mmu *context)
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);

+int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
+{
+ ASSERT(vcpu);
+ ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+ context->shadow_root_level = kvm_x86_ops->get_tdp_level();
+
+ context->nx = is_nx(vcpu); /* TODO: ? */
+ context->new_cr3 = paging_new_cr3;
+ context->page_fault = EPT_page_fault;
+ context->gva_to_gpa = EPT_gva_to_gpa;
+ context->sync_page = EPT_sync_page;
+ context->invlpg = EPT_invlpg;
+ context->update_pte = EPT_update_pte;
+ context->free = paging_free;
+ context->root_level = context->shadow_root_level;
+ context->root_hpa = INVALID_PAGE;
+ context->direct_map = false;
+
+ /* TODO: reset_rsvds_bits_mask() is not built for EPT, we need
+   something different.
+ */
+ reset_rsvds_bits_mask(vcpu, context);
+
+
+ /* TODO: I copied these from kvm_init_shadow_mmu, I don't know why
+   they are done, or why they write to vcpu->arch.mmu and not context
+ */
+ vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
+ vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
+ vcpu->arch.mmu.base_role.smep_andnot_wp =
+ kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) &&
+ !is_write_protection(vcpu);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_EPT_mmu);
+
 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
  int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 6987108..19dd5ab 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -54,6 +54,7 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu
*vcpu, u64 addr, u64 sptes[4]);
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr,
bool direct);
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
+int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);

 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9e0ec9d..f2fd79d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -912,12 +912,16 @@ static inline bool nested_cpu_has2(struct vmcs12
*vmcs12, u32 bit)
  (vmcs12->secondary_vm_exec_control & bit);
 }

-static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
- struct kvm_vcpu *vcpu)
+static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
 {
  return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }

+static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
  return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -6873,6 +6877,46 @@ static void vmx_set_supported_cpuid(u32 func,
struct kvm_cpuid_entry2 *entry)
  entry->ecx |= bit(X86_FEATURE_VMX);
 }

+/* Callbacks for nested_ept_init_mmu_context: */
+
+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
+{
+ /* return the page table to be shadowed - in our case, EPT12 */
+ return get_vmcs12(vcpu)->ept_pointer;
+}
+
+static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
+{
+ struct vmcs12 *vmcs12;
+ nested_vmx_vmexit(vcpu);
+ vmcs12 = get_vmcs12(vcpu);
+ /*
+ * Note no need to set vmcs12->vm_exit_reason as it is already copied
+ * from vmcs02 in nested_vmx_vmexit() above, i.e., EPT_VIOLATION.
+ */
+ vmcs12->exit_qualification = fault->error_code;
+ vmcs12->guest_physical_address = fault->address;
+}
+
+static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+ int r = kvm_init_shadow_EPT_mmu(vcpu, &vcpu->arch.mmu);
+
+ vcpu->arch.mmu.set_cr3   = vmx_set_cr3;
+ vcpu->arch.mmu.get_cr3   = nested_ept_get_cr3;
+ vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
+
+ vcpu->arch.walk_mmu  = &vcpu->arch.nested_mmu;
+
+ return r;
+}
+
+static void nested_ept_un

[PATCH 02/12] Subject: [PATCH 02/10] nEPT: Add EPT tables support to paging_tmpl.h

2013-04-25 Thread Nakajima, Jun
This is the first patch in a series which adds nested EPT support to KVM's
nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use
EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
to set its own cr3 and take its own page faults without either of L0 or L1
getting involved. This often significanlty improves L2's performance over the
previous two alternatives (shadow page tables over EPT, and shadow page
tables over shadow page tables).

This patch adds EPT support to paging_tmpl.h.

paging_tmpl.h contains the code for reading and writing page tables. The code
for 32-bit and 64-bit tables is very similar, but not identical, so
paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
with PTTYPE=64, and this generates the two sets of similar functions.

There are subtle but important differences between the format of EPT tables
and that of ordinary x86 64-bit page tables, so for nested EPT we need a
third set of functions to read the guest EPT table and to write the shadow
EPT table.

So this patch adds a third PTTYPE, PTTYPE_EPT, which creates functions (prefixed
with "EPT") which correctly read and write EPT tables.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 

modified:   arch/x86/kvm/mmu.c
modified:   arch/x86/kvm/paging_tmpl.h
---
 arch/x86/kvm/mmu.c |   5 ++
 arch/x86/kvm/paging_tmpl.h | 135 ++---
 2 files changed, 131 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 956ca35..91cac19 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3418,6 +3418,11 @@ static inline bool is_last_gpte(struct kvm_mmu
*mmu, unsigned level, unsigned gp
  return mmu->last_pte_bitmap & (1 << index);
 }

+#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE PTTYPE_EPT
+#include "paging_tmpl.h"
+#undef PTTYPE
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 105dd5b..6226b51 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -50,6 +50,22 @@
  #define PT_LEVEL_BITS PT32_LEVEL_BITS
  #define PT_MAX_FULL_LEVELS 2
  #define CMPXCHG cmpxchg
+#elif PTTYPE == PTTYPE_EPT
+ #define pt_element_t u64
+ #define guest_walker guest_walkerEPT
+ #define FNAME(name) EPT_##name
+ #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+ #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+ #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+ #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+ #define PT_LEVEL_BITS PT64_LEVEL_BITS
+ #ifdef CONFIG_X86_64
+ #define PT_MAX_FULL_LEVELS 4
+ #define CMPXCHG cmpxchg
+ #else
+ #define CMPXCHG cmpxchg64
+ #define PT_MAX_FULL_LEVELS 2
+ #endif
 #else
  #error Invalid PTTYPE value
 #endif
@@ -80,6 +96,7 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
  return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
 }

+#if PTTYPE != PTTYPE_EPT
 static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
pt_element_t __user *ptep_user, unsigned index,
pt_element_t orig_pte, pt_element_t new_pte)
@@ -102,7 +119,52 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu
*vcpu, struct kvm_mmu *mmu,

  return (ret != orig_pte);
 }
+#endif
+
+static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
+{
+ unsigned access;
+
+#if PTTYPE == PTTYPE_EPT
+ /* We rely here that ACC_WRITE_MASK==VMX_EPT_WRITABLE_MASK */
+ access = (gpte & VMX_EPT_WRITABLE_MASK) | ACC_USER_MASK |
+ ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0);
+#else
+ access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+ access &= ~(gpte >> PT64_NX_SHIFT);
+#endif
+
+ return access;
+}
+
+static inline int FNAME(is_present_gpte)(unsigned long pte)
+{
+#if PTTYPE == PTTYPE_EPT
+ return pte & (VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
+ VMX_EPT_EXECUTABLE_MASK);
+#else
+ return is_present_gpte(pte);
+#endif
+}
+
+static inline int FNAME(check_write_user_access)(struct kvm_vcpu *vcpu,
+   bool write_fault, bool user_fault,
+   unsigned long pte)
+{
+#if PTTYPE == PTTYPE_EPT
+ if (unlikely(write_fault && !(pte & VMX_EPT_WRITABLE_MASK)
+ && (user_fault || is_write_protection(vcpu))))
+ return false;
+ return true;
+#else
+ u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
+| (write_fault ? PFERR_WRITE_MASK : 0);
+
+ return !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access);
+#endif
+}

+#if PTTYPE != PTTYPE_EPT
 static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
  struct kvm_mmu *mmu,
  struct guest_walker *walker,
@@ -139,6 +201,7 @@ static int
FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
  }
  return 0;
 }
+#endif

 /*
  * Fetch a guest pte for a guest virtual address
@@ -147,7 +210,6 @@ static int FNAME(walk_addr_generic)(struct
guest_walker *walker,
 struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 gva_t addr

[PATCH 01/12] Subject: [PATCH 01/10] nEPT: Support LOAD_IA32_EFER entry/exit controls for L1

2013-04-25 Thread Nakajima, Jun
Recent KVM, since http://kerneltrap.org/mailarchive/linux-kvm/2010/5/2/6261577
switches the EFER MSR when EPT is used and the host and guest have different
NX bits. So if we add support for nested EPT (L1 guest using EPT to run L2)
and want to be able to run recent KVM as L1, we need to allow L1 to use this
EFER switching feature.

To do this EFER switching, KVM uses VM_ENTRY/EXIT_LOAD_IA32_EFER if available,
and if it isn't, it uses the generic VM_ENTRY/EXIT_MSR_LOAD. This patch adds
support for the former (the latter is still unsupported).

Nested entry and exit emulation (prepare_vmcs_02 and load_vmcs12_host_state,
respectively) already handled VM_ENTRY/EXIT_LOAD_IA32_EFER correctly. So all
that's left to do in this patch is to properly advertise this feature to L1.

Note that vmcs12's VM_ENTRY/EXIT_LOAD_IA32_EFER are emulated by L0, by using
vmx_set_efer (which itself sets one of several vmcs02 fields), so we always
support this feature, regardless of whether the host supports it.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 

modified:   arch/x86/kvm/vmx.c
---
 arch/x86/kvm/vmx.c | 18 ++
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6667042..9e0ec9d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2057,6 +2057,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #else
  nested_vmx_exit_ctls_high = 0;
 #endif
+ nested_vmx_exit_ctls_high |= VM_EXIT_LOAD_IA32_EFER;

  /* entry controls */
  rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2064,6 +2065,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
  nested_vmx_entry_ctls_low = 0;
  nested_vmx_entry_ctls_high &=
  VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
+ nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_IA32_EFER;

  /* cpu-based controls */
  rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -7050,10 +7052,18 @@ static void prepare_vmcs02(struct kvm_vcpu
*vcpu, struct vmcs12 *vmcs12)
  vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
  vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);

- /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */
- vmcs_write32(VM_EXIT_CONTROLS,
- vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
- vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
+ /* L2->L1 exit controls are emulated - the hardware exit is to L0 so
+ * we should use its exit controls. Note that IA32_MODE, LOAD_IA32_EFER
+ * bits are further modified by vmx_set_efer() below.
+ */
+ vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+
+ /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
+ * emulated by vmx_set_efer(), below.
+ */
+ vmcs_write32(VM_ENTRY_CONTROLS,
+ (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
+ ~VM_ENTRY_IA32E_MODE) |
  (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));

  if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)
--
1.8.2.1.610.g562af5b
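
For reference, a minimal, hedged sketch of what the change means for L1: the
allowed-1 half of the VM-entry capability MSR now shows the bit, so an L1
hypervisor can discover it the usual way. Nothing below is part of the patch;
the helper names are illustrative, and a real hypervisor would prefer the
TRUE_ENTRY_CTLS variant of the MSR when the basic VMX MSR advertises it.

#include <stdint.h>

#define MSR_IA32_VMX_ENTRY_CTLS  0x484
#define VM_ENTRY_LOAD_IA32_EFER  (1u << 15)

/* Read an MSR; only usable at CPL 0 inside the (L1) hypervisor. */
static inline uint64_t rdmsr(uint32_t msr)
{
        uint32_t lo, hi;

        __asm__ volatile("rdmsr" : "=a"(lo), "=d"(hi) : "c"(msr));
        return ((uint64_t)hi << 32) | lo;
}

/* The high 32 bits of the capability MSR are the controls allowed to be 1. */
static int l1_may_load_efer_on_vmentry(void)
{
        uint64_t caps = rdmsr(MSR_IA32_VMX_ENTRY_CTLS);

        return !!((uint32_t)(caps >> 32) & VM_ENTRY_LOAD_IA32_EFER);
}

Because the control is emulated by L0 rather than passed through, this check
succeeds in L1 regardless of whether the physical CPU supports the control.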


Re: [PATCH v11 4/4] tcm_vhost: Enable VIRTIO_SCSI_F_HOTPLUG

2013-04-25 Thread Michael S. Tsirkin
On Thu, Apr 25, 2013 at 03:35:23PM +0800, Asias He wrote:
> Everything for hotplug is ready. Let's enable the feature bit.
> 
> Signed-off-by: Asias He 

Acked-by: Michael S. Tsirkin 

> ---
>  drivers/vhost/tcm_vhost.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
> index 07217d8..1677238 100644
> --- a/drivers/vhost/tcm_vhost.c
> +++ b/drivers/vhost/tcm_vhost.c
> @@ -66,7 +66,8 @@ enum {
>   * TODO: debug and remove the workaround.
>   */
>  enum {
> - VHOST_SCSI_FEATURES = VHOST_FEATURES & (~VIRTIO_RING_F_EVENT_IDX)
> + VHOST_SCSI_FEATURES = (VHOST_FEATURES & (~VIRTIO_RING_F_EVENT_IDX)) |
> +   (1ULL << VIRTIO_SCSI_F_HOTPLUG)
>  };
>  
>  #define VHOST_SCSI_MAX_TARGET256
> -- 
> 1.8.1.4


Re: [PATCH v11 3/4] tcm_vhost: Add ioctl to get and set events missed flag

2013-04-25 Thread Michael S. Tsirkin
On Thu, Apr 25, 2013 at 03:35:22PM +0800, Asias He wrote:
> Signed-off-by: Asias He 

Acked-by: Michael S. Tsirkin 

> ---
>  drivers/vhost/tcm_vhost.c | 17 +
>  drivers/vhost/tcm_vhost.h |  3 +++
>  2 files changed, 20 insertions(+)
> 
> diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
> index 5340fd7..07217d8 100644
> --- a/drivers/vhost/tcm_vhost.c
> +++ b/drivers/vhost/tcm_vhost.c
> @@ -1200,8 +1200,11 @@ static long vhost_scsi_ioctl(struct file *f, unsigned 
> int ioctl,
>   struct vhost_scsi_target backend;
>   void __user *argp = (void __user *)arg;
>   u64 __user *featurep = argp;
> + u32 __user *eventsp = argp;
> + u32 events_missed;
>   u64 features;
>   int r, abi_version = VHOST_SCSI_ABI_VERSION;
> + struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT];
>  
>   switch (ioctl) {
>   case VHOST_SCSI_SET_ENDPOINT:
> @@ -1222,6 +1225,20 @@ static long vhost_scsi_ioctl(struct file *f, unsigned 
> int ioctl,
>   if (copy_to_user(argp, &abi_version, sizeof abi_version))
>   return -EFAULT;
>   return 0;
> + case VHOST_SCSI_SET_EVENTS_MISSED:
> + if (get_user(events_missed, eventsp))
> + return -EFAULT;
> + mutex_lock(&vq->mutex);
> + vs->vs_events_missed = events_missed;
> + mutex_unlock(&vq->mutex);
> + return 0;
> + case VHOST_SCSI_GET_EVENTS_MISSED:
> + mutex_lock(&vq->mutex);
> + events_missed = vs->vs_events_missed;
> + mutex_unlock(&vq->mutex);
> + if (put_user(events_missed, eventsp))
> + return -EFAULT;
> + return 0;
>   case VHOST_GET_FEATURES:
>   features = VHOST_SCSI_FEATURES;
>   if (copy_to_user(featurep, &features, sizeof features))
> diff --git a/drivers/vhost/tcm_vhost.h b/drivers/vhost/tcm_vhost.h
> index a545a5b..514b9fd 100644
> --- a/drivers/vhost/tcm_vhost.h
> +++ b/drivers/vhost/tcm_vhost.h
> @@ -123,3 +123,6 @@ struct vhost_scsi_target {
>  #define VHOST_SCSI_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x41, struct 
> vhost_scsi_target)
>  /* Changing this breaks userspace. */
>  #define VHOST_SCSI_GET_ABI_VERSION _IOW(VHOST_VIRTIO, 0x42, int)
> +/* Set and get the events missed flag */
> +#define VHOST_SCSI_SET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x43, __u32)
> +#define VHOST_SCSI_GET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x44, __u32)
> -- 
> 1.8.1.4


Re: [PATCH v11 2/4] tcm_vhost: Add hotplug/hotunplug support

2013-04-25 Thread Michael S. Tsirkin
On Thu, Apr 25, 2013 at 03:35:21PM +0800, Asias He wrote:
> In commit 365a7150094 ([SCSI] virtio-scsi: hotplug support for
> virtio-scsi), hotplug support is added to virtio-scsi.
> 
> This patch adds hotplug and hotunplug support to tcm_vhost.
> 
> You can create or delete a LUN in targetcli to hotplug or hotunplug a
> LUN in guest.
> 
> Signed-off-by: Asias He 
> Reviewed-by: Stefan Hajnoczi 

Acked-by: Michael S. Tsirkin 

> ---
>  drivers/vhost/tcm_vhost.c | 210 
> +-
>  drivers/vhost/tcm_vhost.h |  10 +++
>  2 files changed, 218 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
> index 822cd1f..5340fd7 100644
> --- a/drivers/vhost/tcm_vhost.c
> +++ b/drivers/vhost/tcm_vhost.c
> @@ -71,6 +71,7 @@ enum {
>  
>  #define VHOST_SCSI_MAX_TARGET256
>  #define VHOST_SCSI_MAX_VQ128
> +#define VHOST_SCSI_MAX_EVENT 128
>  
>  struct vhost_scsi {
>   /* Protected by vhost_scsi->dev.mutex */
> @@ -82,6 +83,12 @@ struct vhost_scsi {
>  
>   struct vhost_work vs_completion_work; /* cmd completion work item */
>   struct llist_head vs_completion_list; /* cmd completion queue */
> +
> + struct vhost_work vs_event_work; /* evt injection work item */
> + struct llist_head vs_event_list; /* evt injection queue */
> +
> + bool vs_events_missed; /* any missed events, protected by vq->mutex */
> + int vs_events_nr; /* num of pending events, protected by vq->mutex */
>  };
>  
>  /* Local pointer to allocated TCM configfs fabric module */
> @@ -349,6 +356,37 @@ static int tcm_vhost_queue_tm_rsp(struct se_cmd *se_cmd)
>   return 0;
>  }
>  
> +static void tcm_vhost_free_evt(struct vhost_scsi *vs, struct tcm_vhost_evt 
> *evt)
> +{
> + vs->vs_events_nr--;
> + kfree(evt);
> +}
> +
> +static struct tcm_vhost_evt *tcm_vhost_allocate_evt(struct vhost_scsi *vs,
> + u32 event, u32 reason)
> +{
> + struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT];
> + struct tcm_vhost_evt *evt;
> +
> + if (vs->vs_events_nr > VHOST_SCSI_MAX_EVENT) {
> + vs->vs_events_missed = true;
> + return NULL;
> + }
> +
> + evt = kzalloc(sizeof(*evt), GFP_KERNEL);
> + if (!evt) {
> + vq_err(vq, "Failed to allocate tcm_vhost_evt\n");
> + vs->vs_events_missed = true;
> + return NULL;
> + }
> +
> + evt->event.event = event;
> + evt->event.reason = reason;
> + vs->vs_events_nr++;
> +
> + return evt;
> +}
> +
>  static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
>  {
>   struct se_cmd *se_cmd = &tv_cmd->tvc_se_cmd;
> @@ -367,6 +405,75 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd 
> *tv_cmd)
>   kfree(tv_cmd);
>  }
>  
> +static void tcm_vhost_do_evt_work(struct vhost_scsi *vs,
> + struct tcm_vhost_evt *evt)
> +{
> + struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT];
> + struct virtio_scsi_event *event = &evt->event;
> + struct virtio_scsi_event __user *eventp;
> + unsigned out, in;
> + int head, ret;
> +
> + if (!vq->private_data) {
> + vs->vs_events_missed = true;
> + return;
> + }
> +
> +again:
> + vhost_disable_notify(&vs->dev, vq);
> + head = vhost_get_vq_desc(&vs->dev, vq, vq->iov,
> + ARRAY_SIZE(vq->iov), &out, &in,
> + NULL, NULL);
> + if (head < 0) {
> + vs->vs_events_missed = true;
> + return;
> + }
> + if (head == vq->num) {
> + if (vhost_enable_notify(&vs->dev, vq))
> + goto again;
> + vs->vs_events_missed = true;
> + return;
> + }
> +
> + if ((vq->iov[out].iov_len != sizeof(struct virtio_scsi_event))) {
> + vq_err(vq, "Expecting virtio_scsi_event, got %zu bytes\n",
> + vq->iov[out].iov_len);
> + vs->vs_events_missed = true;
> + return;
> + }
> +
> + if (vs->vs_events_missed) {
> + event->event |= VIRTIO_SCSI_T_EVENTS_MISSED;
> + vs->vs_events_missed = false;
> + }
> +
> + eventp = vq->iov[out].iov_base;
> + ret = __copy_to_user(eventp, event, sizeof(*event));
> + if (!ret)
> + vhost_add_used_and_signal(&vs->dev, vq, head, 0);
> + else
> + vq_err(vq, "Faulted on tcm_vhost_send_event\n");
> +}
> +
> +static void tcm_vhost_evt_work(struct vhost_work *work)
> +{
> + struct vhost_scsi *vs = container_of(work, struct vhost_scsi,
> + vs_event_work);
> + struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT];
> + struct tcm_vhost_evt *evt;
> + struct llist_node *llnode;
> +
> + mutex_lock(&vq->mutex);
> + llnode = llist_del_all(&vs->vs_event_list);
> + while (llnode) {
> + evt = llist_entry(llnode, struct tcm_vhost_evt, list);
> + llnode = lli

Re: [PATCH v11 1/4] tcm_vhost: Refactor the lock nesting rule

2013-04-25 Thread Michael S. Tsirkin
On Thu, Apr 25, 2013 at 03:35:20PM +0800, Asias He wrote:
> We want to use tcm_vhost_mutex to make sure hotplug/hotunplug will not
> happen when set_endpoint/clear_endpoint is in process.
> 
> Signed-off-by: Asias He 

Acked-by: Michael S. Tsirkin 

> ---
>  drivers/vhost/tcm_vhost.c | 32 +++-
>  1 file changed, 19 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
> index 957a0b9..822cd1f 100644
> --- a/drivers/vhost/tcm_vhost.c
> +++ b/drivers/vhost/tcm_vhost.c
> @@ -808,6 +808,9 @@ static void vhost_scsi_flush(struct vhost_scsi *vs)
>  /*
>   * Called from vhost_scsi_ioctl() context to walk the list of available
>   * tcm_vhost_tpg with an active struct tcm_vhost_nexus
> + *
> + *  The lock nesting rule is:
> + *tcm_vhost_mutex -> vs->dev.mutex -> tpg->tv_tpg_mutex -> vq->mutex
>   */
>  static int vhost_scsi_set_endpoint(
>   struct vhost_scsi *vs,
> @@ -820,26 +823,27 @@ static int vhost_scsi_set_endpoint(
>   int index, ret, i, len;
>   bool match = false;
>  
> + mutex_lock(&tcm_vhost_mutex);
>   mutex_lock(&vs->dev.mutex);
> +
>   /* Verify that ring has been setup correctly. */
>   for (index = 0; index < vs->dev.nvqs; ++index) {
>   /* Verify that ring has been setup correctly. */
>   if (!vhost_vq_access_ok(&vs->vqs[index])) {
> - mutex_unlock(&vs->dev.mutex);
> - return -EFAULT;
> + ret = -EFAULT;
> + goto out;
>   }
>   }
>  
>   len = sizeof(vs_tpg[0]) * VHOST_SCSI_MAX_TARGET;
>   vs_tpg = kzalloc(len, GFP_KERNEL);
>   if (!vs_tpg) {
> - mutex_unlock(&vs->dev.mutex);
> - return -ENOMEM;
> + ret = -ENOMEM;
> + goto out;
>   }
>   if (vs->vs_tpg)
>   memcpy(vs_tpg, vs->vs_tpg, len);
>  
> - mutex_lock(&tcm_vhost_mutex);
>   list_for_each_entry(tv_tpg, &tcm_vhost_list, tv_tpg_list) {
>   mutex_lock(&tv_tpg->tv_tpg_mutex);
>   if (!tv_tpg->tpg_nexus) {
> @@ -854,11 +858,10 @@ static int vhost_scsi_set_endpoint(
>  
>   if (!strcmp(tv_tport->tport_name, t->vhost_wwpn)) {
>   if (vs->vs_tpg && vs->vs_tpg[tv_tpg->tport_tpgt]) {
> - mutex_unlock(&tv_tpg->tv_tpg_mutex);
> - mutex_unlock(&tcm_vhost_mutex);
> - mutex_unlock(&vs->dev.mutex);
>   kfree(vs_tpg);
> - return -EEXIST;
> + mutex_unlock(&tv_tpg->tv_tpg_mutex);
> + ret = -EEXIST;
> + goto out;
>   }
>   tv_tpg->tv_tpg_vhost_count++;
>   vs_tpg[tv_tpg->tport_tpgt] = tv_tpg;
> @@ -867,7 +870,6 @@ static int vhost_scsi_set_endpoint(
>   }
>   mutex_unlock(&tv_tpg->tv_tpg_mutex);
>   }
> - mutex_unlock(&tcm_vhost_mutex);
>  
>   if (match) {
>   memcpy(vs->vs_vhost_wwpn, t->vhost_wwpn,
> @@ -893,7 +895,9 @@ static int vhost_scsi_set_endpoint(
>   kfree(vs->vs_tpg);
>   vs->vs_tpg = vs_tpg;
>  
> +out:
>   mutex_unlock(&vs->dev.mutex);
> + mutex_unlock(&tcm_vhost_mutex);
>   return ret;
>  }
>  
> @@ -908,6 +912,7 @@ static int vhost_scsi_clear_endpoint(
>   int index, ret, i;
>   u8 target;
>  
> + mutex_lock(&tcm_vhost_mutex);
>   mutex_lock(&vs->dev.mutex);
>   /* Verify that ring has been setup correctly. */
>   for (index = 0; index < vs->dev.nvqs; ++index) {
> @@ -918,8 +923,8 @@ static int vhost_scsi_clear_endpoint(
>   }
>  
>   if (!vs->vs_tpg) {
> - mutex_unlock(&vs->dev.mutex);
> - return 0;
> + ret = 0;
> + goto err_dev;
>   }
>  
>   for (i = 0; i < VHOST_SCSI_MAX_TARGET; i++) {
> @@ -965,13 +970,14 @@ static int vhost_scsi_clear_endpoint(
>   kfree(vs->vs_tpg);
>   vs->vs_tpg = NULL;
>   mutex_unlock(&vs->dev.mutex);
> -
> + mutex_unlock(&tcm_vhost_mutex);
>   return 0;
>  
>  err_tpg:
>   mutex_unlock(&tv_tpg->tv_tpg_mutex);
>  err_dev:
>   mutex_unlock(&vs->dev.mutex);
> + mutex_unlock(&tcm_vhost_mutex);
>   return ret;
>  }
>  
> -- 
> 1.8.1.4


Re: [PATCH v11 0/4] tcm_vhost hotplug

2013-04-25 Thread Michael S. Tsirkin
On Thu, Apr 25, 2013 at 03:35:19PM +0800, Asias He wrote:
> Changes in v11
> - Drop change log history in commit log
> 
> Changes in v10
> - Drop comments about lun
> - Add Enable VIRTIO_SCSI_F_HOTPLUG to this series
> 
> Changes in v9
> - Drop tcm_vhost_check_feature
> - Add Refactor the lock nesting rule to this series
> 
> Asias He (4):
>   tcm_vhost: Refactor the lock nesting rule
>   tcm_vhost: Add hotplug/hotunplug support
>   tcm_vhost: Add ioctl to get and set events missed flag
>   tcm_vhost: Enable VIRTIO_SCSI_F_HOTPLUG
> 
>  drivers/vhost/tcm_vhost.c | 262 
> +++---
>  drivers/vhost/tcm_vhost.h |  13 +++
>  2 files changed, 259 insertions(+), 16 deletions(-)


Acked-by: Michael S. Tsirkin 

> -- 
> 1.8.1.4


[PATCH v11 4/4] tcm_vhost: Enable VIRTIO_SCSI_F_HOTPLUG

2013-04-25 Thread Asias He
Everything for hotplug is ready. Let's enable the feature bit.

Signed-off-by: Asias He 
---
 drivers/vhost/tcm_vhost.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
index 07217d8..1677238 100644
--- a/drivers/vhost/tcm_vhost.c
+++ b/drivers/vhost/tcm_vhost.c
@@ -66,7 +66,8 @@ enum {
  * TODO: debug and remove the workaround.
  */
 enum {
-   VHOST_SCSI_FEATURES = VHOST_FEATURES & (~VIRTIO_RING_F_EVENT_IDX)
+   VHOST_SCSI_FEATURES = (VHOST_FEATURES & (~VIRTIO_RING_F_EVENT_IDX)) |
+ (1ULL << VIRTIO_SCSI_F_HOTPLUG)
 };
 
 #define VHOST_SCSI_MAX_TARGET  256
-- 
1.8.1.4
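
A hedged userspace sketch of what the new advertisement looks like from the
other side of the vhost fd. The device path and the spelled-out constants are
illustrative only (real code would include linux/vhost.h and
linux/virtio_scsi.h), and an already-created vhost-scsi instance is assumed.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>

#define VHOST_VIRTIO            0xAF
#define VHOST_GET_FEATURES      _IOR(VHOST_VIRTIO, 0x00, __u64)
#define VIRTIO_SCSI_F_HOTPLUG   1

int main(void)
{
        __u64 features = 0;
        int fd = open("/dev/vhost-scsi", O_RDWR);

        if (fd < 0 || ioctl(fd, VHOST_GET_FEATURES, &features) < 0)
                return 1;

        /* With this patch applied, the hotplug bit shows up in the mask. */
        printf("hotplug feature %sadvertised\n",
               (features & (1ULL << VIRTIO_SCSI_F_HOTPLUG)) ? "" : "not ");
        close(fd);
        return 0;
}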



[PATCH v11 3/4] tcm_vhost: Add ioctl to get and set events missed flag

2013-04-25 Thread Asias He
Signed-off-by: Asias He 
---
 drivers/vhost/tcm_vhost.c | 17 +
 drivers/vhost/tcm_vhost.h |  3 +++
 2 files changed, 20 insertions(+)

diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
index 5340fd7..07217d8 100644
--- a/drivers/vhost/tcm_vhost.c
+++ b/drivers/vhost/tcm_vhost.c
@@ -1200,8 +1200,11 @@ static long vhost_scsi_ioctl(struct file *f, unsigned 
int ioctl,
struct vhost_scsi_target backend;
void __user *argp = (void __user *)arg;
u64 __user *featurep = argp;
+   u32 __user *eventsp = argp;
+   u32 events_missed;
u64 features;
int r, abi_version = VHOST_SCSI_ABI_VERSION;
+   struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT];
 
switch (ioctl) {
case VHOST_SCSI_SET_ENDPOINT:
@@ -1222,6 +1225,20 @@ static long vhost_scsi_ioctl(struct file *f, unsigned 
int ioctl,
if (copy_to_user(argp, &abi_version, sizeof abi_version))
return -EFAULT;
return 0;
+   case VHOST_SCSI_SET_EVENTS_MISSED:
+   if (get_user(events_missed, eventsp))
+   return -EFAULT;
+   mutex_lock(&vq->mutex);
+   vs->vs_events_missed = events_missed;
+   mutex_unlock(&vq->mutex);
+   return 0;
+   case VHOST_SCSI_GET_EVENTS_MISSED:
+   mutex_lock(&vq->mutex);
+   events_missed = vs->vs_events_missed;
+   mutex_unlock(&vq->mutex);
+   if (put_user(events_missed, eventsp))
+   return -EFAULT;
+   return 0;
case VHOST_GET_FEATURES:
features = VHOST_SCSI_FEATURES;
if (copy_to_user(featurep, &features, sizeof features))
diff --git a/drivers/vhost/tcm_vhost.h b/drivers/vhost/tcm_vhost.h
index a545a5b..514b9fd 100644
--- a/drivers/vhost/tcm_vhost.h
+++ b/drivers/vhost/tcm_vhost.h
@@ -123,3 +123,6 @@ struct vhost_scsi_target {
 #define VHOST_SCSI_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x41, struct 
vhost_scsi_target)
 /* Changing this breaks userspace. */
 #define VHOST_SCSI_GET_ABI_VERSION _IOW(VHOST_VIRTIO, 0x42, int)
+/* Set and get the events missed flag */
+#define VHOST_SCSI_SET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x43, __u32)
+#define VHOST_SCSI_GET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x44, __u32)
-- 
1.8.1.4
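
A hedged sketch of driving the new ioctls from userspace. The ioctl numbers
mirror the patch; the device path, flow, and the assumption of an
already-configured vhost-scsi instance are illustrative.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>

#define VHOST_VIRTIO                 0xAF
#define VHOST_SCSI_SET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x43, __u32)
#define VHOST_SCSI_GET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x44, __u32)

int main(void)
{
        __u32 missed = 0;
        int fd = open("/dev/vhost-scsi", O_RDWR);

        if (fd < 0)
                return 1;

        /* Query whether the kernel had to drop any hotplug events... */
        if (ioctl(fd, VHOST_SCSI_GET_EVENTS_MISSED, &missed) == 0)
                printf("events missed flag: %u\n", missed);

        /* ...and clear the flag once userspace has rescanned. */
        missed = 0;
        ioctl(fd, VHOST_SCSI_SET_EVENTS_MISSED, &missed);

        close(fd);
        return 0;
}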



[PATCH v11 2/4] tcm_vhost: Add hotplug/hotunplug support

2013-04-25 Thread Asias He
In commit 365a7150094 ([SCSI] virtio-scsi: hotplug support for
virtio-scsi), hotplug support was added to virtio-scsi.

This patch adds hotplug and hotunplug support to tcm_vhost.

You can create or delete a LUN in targetcli to hotplug or hotunplug a
LUN in the guest.

Signed-off-by: Asias He 
Reviewed-by: Stefan Hajnoczi 
---
 drivers/vhost/tcm_vhost.c | 210 +-
 drivers/vhost/tcm_vhost.h |  10 +++
 2 files changed, 218 insertions(+), 2 deletions(-)
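
As background for the diff below, here is a hedged, self-contained sketch of
how a guest-side consumer would interpret the events this patch injects on the
event virtqueue. The constants and the structure layout mirror the virtio-scsi
spec; the handler itself and the LUN decoding are illustrative only.

#include <stdio.h>
#include <stdint.h>

#define VIRTIO_SCSI_T_TRANSPORT_RESET  1
#define VIRTIO_SCSI_T_EVENTS_MISSED    0x80000000u
#define VIRTIO_SCSI_EVT_RESET_RESCAN   1        /* LUN appeared (hotplug)      */
#define VIRTIO_SCSI_EVT_RESET_REMOVED  2        /* LUN disappeared (hotunplug) */

struct virtio_scsi_event {
        uint32_t event;
        uint8_t  lun[8];
        uint32_t reason;
};

static void handle_scsi_evt(const struct virtio_scsi_event *ev)
{
        unsigned target = ev->lun[1];
        unsigned lun    = ((ev->lun[2] << 8) | ev->lun[3]) & 0x3fff;

        /* Host ran out of event descriptors: a full rescan is needed. */
        if (ev->event & VIRTIO_SCSI_T_EVENTS_MISSED)
                printf("events were missed: rescan the whole host\n");

        switch (ev->event & ~VIRTIO_SCSI_T_EVENTS_MISSED) {
        case VIRTIO_SCSI_T_TRANSPORT_RESET:
                if (ev->reason == VIRTIO_SCSI_EVT_RESET_RESCAN)
                        printf("hotplug: scan target %u lun %u\n", target, lun);
                else if (ev->reason == VIRTIO_SCSI_EVT_RESET_REMOVED)
                        printf("hotunplug: remove target %u lun %u\n", target, lun);
                break;
        }
}

int main(void)
{
        struct virtio_scsi_event ev = {
                .event  = VIRTIO_SCSI_T_TRANSPORT_RESET,
                .reason = VIRTIO_SCSI_EVT_RESET_RESCAN,
                .lun    = { 1, 0, 0x40, 5 },    /* target 0, lun 5 */
        };

        handle_scsi_evt(&ev);
        return 0;
}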

diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
index 822cd1f..5340fd7 100644
--- a/drivers/vhost/tcm_vhost.c
+++ b/drivers/vhost/tcm_vhost.c
@@ -71,6 +71,7 @@ enum {
 
 #define VHOST_SCSI_MAX_TARGET  256
 #define VHOST_SCSI_MAX_VQ  128
+#define VHOST_SCSI_MAX_EVENT   128
 
 struct vhost_scsi {
/* Protected by vhost_scsi->dev.mutex */
@@ -82,6 +83,12 @@ struct vhost_scsi {
 
struct vhost_work vs_completion_work; /* cmd completion work item */
struct llist_head vs_completion_list; /* cmd completion queue */
+
+   struct vhost_work vs_event_work; /* evt injection work item */
+   struct llist_head vs_event_list; /* evt injection queue */
+
+   bool vs_events_missed; /* any missed events, protected by vq->mutex */
+   int vs_events_nr; /* num of pending events, protected by vq->mutex */
 };
 
 /* Local pointer to allocated TCM configfs fabric module */
@@ -349,6 +356,37 @@ static int tcm_vhost_queue_tm_rsp(struct se_cmd *se_cmd)
return 0;
 }
 
+static void tcm_vhost_free_evt(struct vhost_scsi *vs, struct tcm_vhost_evt 
*evt)
+{
+   vs->vs_events_nr--;
+   kfree(evt);
+}
+
+static struct tcm_vhost_evt *tcm_vhost_allocate_evt(struct vhost_scsi *vs,
+   u32 event, u32 reason)
+{
+   struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT];
+   struct tcm_vhost_evt *evt;
+
+   if (vs->vs_events_nr > VHOST_SCSI_MAX_EVENT) {
+   vs->vs_events_missed = true;
+   return NULL;
+   }
+
+   evt = kzalloc(sizeof(*evt), GFP_KERNEL);
+   if (!evt) {
+   vq_err(vq, "Failed to allocate tcm_vhost_evt\n");
+   vs->vs_events_missed = true;
+   return NULL;
+   }
+
+   evt->event.event = event;
+   evt->event.reason = reason;
+   vs->vs_events_nr++;
+
+   return evt;
+}
+
 static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
 {
struct se_cmd *se_cmd = &tv_cmd->tvc_se_cmd;
@@ -367,6 +405,75 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd 
*tv_cmd)
kfree(tv_cmd);
 }
 
+static void tcm_vhost_do_evt_work(struct vhost_scsi *vs,
+   struct tcm_vhost_evt *evt)
+{
+   struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT];
+   struct virtio_scsi_event *event = &evt->event;
+   struct virtio_scsi_event __user *eventp;
+   unsigned out, in;
+   int head, ret;
+
+   if (!vq->private_data) {
+   vs->vs_events_missed = true;
+   return;
+   }
+
+again:
+   vhost_disable_notify(&vs->dev, vq);
+   head = vhost_get_vq_desc(&vs->dev, vq, vq->iov,
+   ARRAY_SIZE(vq->iov), &out, &in,
+   NULL, NULL);
+   if (head < 0) {
+   vs->vs_events_missed = true;
+   return;
+   }
+   if (head == vq->num) {
+   if (vhost_enable_notify(&vs->dev, vq))
+   goto again;
+   vs->vs_events_missed = true;
+   return;
+   }
+
+   if ((vq->iov[out].iov_len != sizeof(struct virtio_scsi_event))) {
+   vq_err(vq, "Expecting virtio_scsi_event, got %zu bytes\n",
+   vq->iov[out].iov_len);
+   vs->vs_events_missed = true;
+   return;
+   }
+
+   if (vs->vs_events_missed) {
+   event->event |= VIRTIO_SCSI_T_EVENTS_MISSED;
+   vs->vs_events_missed = false;
+   }
+
+   eventp = vq->iov[out].iov_base;
+   ret = __copy_to_user(eventp, event, sizeof(*event));
+   if (!ret)
+   vhost_add_used_and_signal(&vs->dev, vq, head, 0);
+   else
+   vq_err(vq, "Faulted on tcm_vhost_send_event\n");
+}
+
+static void tcm_vhost_evt_work(struct vhost_work *work)
+{
+   struct vhost_scsi *vs = container_of(work, struct vhost_scsi,
+   vs_event_work);
+   struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT];
+   struct tcm_vhost_evt *evt;
+   struct llist_node *llnode;
+
+   mutex_lock(&vq->mutex);
+   llnode = llist_del_all(&vs->vs_event_list);
+   while (llnode) {
+   evt = llist_entry(llnode, struct tcm_vhost_evt, list);
+   llnode = llist_next(llnode);
+   tcm_vhost_do_evt_work(vs, evt);
+   tcm_vhost_free_evt(vs, evt);
+   }
+   mutex_unlock(&vq->mutex);
+}
+
 /* Fill in status and signal that we are done processing this command
 

[PATCH v11 1/4] tcm_vhost: Refactor the lock nesting rule

2013-04-25 Thread Asias He
We want to use tcm_vhost_mutex to make sure hotplug/hotunplug will not
happen while set_endpoint/clear_endpoint is in progress.

Signed-off-by: Asias He 
---
 drivers/vhost/tcm_vhost.c | 32 +++-
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
index 957a0b9..822cd1f 100644
--- a/drivers/vhost/tcm_vhost.c
+++ b/drivers/vhost/tcm_vhost.c
@@ -808,6 +808,9 @@ static void vhost_scsi_flush(struct vhost_scsi *vs)
 /*
  * Called from vhost_scsi_ioctl() context to walk the list of available
  * tcm_vhost_tpg with an active struct tcm_vhost_nexus
+ *
+ *  The lock nesting rule is:
+ *tcm_vhost_mutex -> vs->dev.mutex -> tpg->tv_tpg_mutex -> vq->mutex
  */
 static int vhost_scsi_set_endpoint(
struct vhost_scsi *vs,
@@ -820,26 +823,27 @@ static int vhost_scsi_set_endpoint(
int index, ret, i, len;
bool match = false;
 
+   mutex_lock(&tcm_vhost_mutex);
mutex_lock(&vs->dev.mutex);
+
/* Verify that ring has been setup correctly. */
for (index = 0; index < vs->dev.nvqs; ++index) {
/* Verify that ring has been setup correctly. */
if (!vhost_vq_access_ok(&vs->vqs[index])) {
-   mutex_unlock(&vs->dev.mutex);
-   return -EFAULT;
+   ret = -EFAULT;
+   goto out;
}
}
 
len = sizeof(vs_tpg[0]) * VHOST_SCSI_MAX_TARGET;
vs_tpg = kzalloc(len, GFP_KERNEL);
if (!vs_tpg) {
-   mutex_unlock(&vs->dev.mutex);
-   return -ENOMEM;
+   ret = -ENOMEM;
+   goto out;
}
if (vs->vs_tpg)
memcpy(vs_tpg, vs->vs_tpg, len);
 
-   mutex_lock(&tcm_vhost_mutex);
list_for_each_entry(tv_tpg, &tcm_vhost_list, tv_tpg_list) {
mutex_lock(&tv_tpg->tv_tpg_mutex);
if (!tv_tpg->tpg_nexus) {
@@ -854,11 +858,10 @@ static int vhost_scsi_set_endpoint(
 
if (!strcmp(tv_tport->tport_name, t->vhost_wwpn)) {
if (vs->vs_tpg && vs->vs_tpg[tv_tpg->tport_tpgt]) {
-   mutex_unlock(&tv_tpg->tv_tpg_mutex);
-   mutex_unlock(&tcm_vhost_mutex);
-   mutex_unlock(&vs->dev.mutex);
kfree(vs_tpg);
-   return -EEXIST;
+   mutex_unlock(&tv_tpg->tv_tpg_mutex);
+   ret = -EEXIST;
+   goto out;
}
tv_tpg->tv_tpg_vhost_count++;
vs_tpg[tv_tpg->tport_tpgt] = tv_tpg;
@@ -867,7 +870,6 @@ static int vhost_scsi_set_endpoint(
}
mutex_unlock(&tv_tpg->tv_tpg_mutex);
}
-   mutex_unlock(&tcm_vhost_mutex);
 
if (match) {
memcpy(vs->vs_vhost_wwpn, t->vhost_wwpn,
@@ -893,7 +895,9 @@ static int vhost_scsi_set_endpoint(
kfree(vs->vs_tpg);
vs->vs_tpg = vs_tpg;
 
+out:
mutex_unlock(&vs->dev.mutex);
+   mutex_unlock(&tcm_vhost_mutex);
return ret;
 }
 
@@ -908,6 +912,7 @@ static int vhost_scsi_clear_endpoint(
int index, ret, i;
u8 target;
 
+   mutex_lock(&tcm_vhost_mutex);
mutex_lock(&vs->dev.mutex);
/* Verify that ring has been setup correctly. */
for (index = 0; index < vs->dev.nvqs; ++index) {
@@ -918,8 +923,8 @@ static int vhost_scsi_clear_endpoint(
}
 
if (!vs->vs_tpg) {
-   mutex_unlock(&vs->dev.mutex);
-   return 0;
+   ret = 0;
+   goto err_dev;
}
 
for (i = 0; i < VHOST_SCSI_MAX_TARGET; i++) {
@@ -965,13 +970,14 @@ static int vhost_scsi_clear_endpoint(
kfree(vs->vs_tpg);
vs->vs_tpg = NULL;
mutex_unlock(&vs->dev.mutex);
-
+   mutex_unlock(&tcm_vhost_mutex);
return 0;
 
 err_tpg:
mutex_unlock(&tv_tpg->tv_tpg_mutex);
 err_dev:
mutex_unlock(&vs->dev.mutex);
+   mutex_unlock(&tcm_vhost_mutex);
return ret;
 }
 
-- 
1.8.1.4
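
To make the new nesting rule concrete, here is a hedged, stand-alone
illustration using pthread mutexes (the real code uses the kernel mutexes
named in the patch comment; the function, the rings_ok check, and the locking
bodies are placeholders):

#include <pthread.h>
#include <errno.h>

static pthread_mutex_t tcm_vhost_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t dev_mutex       = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t tpg_mutex       = PTHREAD_MUTEX_INITIALIZER;

static int rings_ok = 1;   /* stand-in for vhost_vq_access_ok() */

static int set_endpoint(void)
{
        int ret = 0;

        pthread_mutex_lock(&tcm_vhost_mutex);   /* outermost, taken first */
        pthread_mutex_lock(&dev_mutex);

        if (!rings_ok) {
                ret = -EFAULT;
                goto out;                       /* single unwind path */
        }

        pthread_mutex_lock(&tpg_mutex);
        /* ... per-tpg work ... */
        pthread_mutex_unlock(&tpg_mutex);
out:
        pthread_mutex_unlock(&dev_mutex);
        pthread_mutex_unlock(&tcm_vhost_mutex); /* release in reverse order */
        return ret;
}

int main(void)
{
        return set_endpoint() ? 1 : 0;
}

Taking tcm_vhost_mutex outermost is what lets hotplug/hotunplug and
set_endpoint/clear_endpoint exclude each other without inverting any of the
existing lock pairs.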



[PATCH v11 0/4] tcm_vhost hotplug

2013-04-25 Thread Asias He
Changes in v11
- Drop change log history in commit log

Changes in v10
- Drop comments about lun
- Add Enable VIRTIO_SCSI_F_HOTPLUG to this series

Changes in v9
- Drop tcm_vhost_check_feature
- Add Refactor the lock nesting rule to this series

Asias He (4):
  tcm_vhost: Refactor the lock nesting rule
  tcm_vhost: Add hotplug/hotunplug support
  tcm_vhost: Add ioctl to get and set events missed flag
  tcm_vhost: Enable VIRTIO_SCSI_F_HOTPLUG

 drivers/vhost/tcm_vhost.c | 262 +++---
 drivers/vhost/tcm_vhost.h |  13 +++
 2 files changed, 259 insertions(+), 16 deletions(-)

-- 
1.8.1.4



Re: [PATCH 0/7] KVM: irqfd generalization prepare patch set

2013-04-25 Thread Gleb Natapov
On Wed, Apr 24, 2013 at 01:20:31PM +0300, Gleb Natapov wrote:
> On Tue, Apr 16, 2013 at 07:26:08PM +0200, Alexander Graf wrote:
> > The concept of an irqfd and interrupt routing are nothing particularly tied
> > into the IOAPIC implementation. In fact, most of the code already is 
> > perfectly
> > generic.
> > 
> > This patch set decouples most bits of the existing irqchip and irqfd
> > implementation to make it reusable for non-IOAPIC platforms, like the PPC 
> > MPIC.
> > 
> > I also have a patch that implements working irqfd support on top of these,
> > but that requires the in-kernel MPIC implementation to go upstream first, so
> > I'm holding off on it until we settled everything there, so the concept
> > certainly does work.
> > 
> > Alex
> > 
> Nice cleanup, thanks! Should I expect a new series with "ifdef
> kvm_irqchip" and ia64 compilation fixed? The fixes are minor enough for
> me to fix them while applying.
> 
Actually the series does not apply any more and has to be rebased on top of the
current queue.

--
Gleb.


Re: [PATCH v10 2/4] tcm_vhost: Add hotplug/hotunplug support

2013-04-25 Thread Michael S. Tsirkin
On Thu, Apr 25, 2013 at 03:14:11PM +0800, Asias He wrote:
> MST wants one without change history in commit log.

So post v11, and add changes since v9 and since v10
after ---.
Make life easy for maintainers please.

> >From 8996c9464fae1f28d0bd729677a3917d204990ec Mon Sep 17 00:00:00 2001
> From: Asias He 
> Date: Thu, 25 Apr 2013 09:51:26 +0800
> Subject: [PATCH v10 2/4] tcm_vhost: Add hotplug/hotunplug support
> 
> In commit 365a7150094 ([SCSI] virtio-scsi: hotplug support for
> virtio-scsi), hotplug support is added to virtio-scsi.
> 
> This patch adds hotplug and hotunplug support to tcm_vhost.
> 
> You can create or delete a LUN in targetcli to hotplug or hotunplug a
> LUN in guest.
> 
> Signed-off-by: Asias He 
> Reviewed-by: Stefan Hajnoczi 
> ---
> 
> Changes in v8:
> - Use vq->mutex for event
> - Drop tcm_vhost: Add helper to check if endpoint is setup
> - Rename vs_events_dropped to vs_events_missed
> - Init lun[] explicitly
> 
> Changes in v7:
> - Add vhost_work_flush for vs->vs_event_work to this series
> 
> Changes in v6:
> - Pass tcm_vhost_evt to tcm_vhost_do_evt_work
> 
> Changes in v5:
> - Switch to int from u64 to vs_events_nr
> - Set s->vs_events_dropped flag in tcm_vhost_allocate_evt
> - Do not nest dev mutex within vq mutex
> - Use vs_events_lock to protect vs_events_dropped and vs_events_nr
> - Rebase to target/master
> 
> Changes in v4:
> - Drop tcm_vhost_check_endpoint in tcm_vhost_send_evt
> - Add tcm_vhost_check_endpoint in vhost_scsi_evt_handle_kick
> 
> Changes in v3:
> - Separate the bug fix to another thread
> 
> Changes in v2:
> - Remove code duplication in tcm_vhost_{hotplug,hotunplug}
> - Fix racing of vs_events_nr
> - Add flush fix patch to this series
> 
>  drivers/vhost/tcm_vhost.c | 210 
> +-
>  drivers/vhost/tcm_vhost.h |  10 +++
>  2 files changed, 218 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
> index 822cd1f..5340fd7 100644
> --- a/drivers/vhost/tcm_vhost.c
> +++ b/drivers/vhost/tcm_vhost.c
> @@ -71,6 +71,7 @@ enum {
>  
>  #define VHOST_SCSI_MAX_TARGET256
>  #define VHOST_SCSI_MAX_VQ128
> +#define VHOST_SCSI_MAX_EVENT 128
>  
>  struct vhost_scsi {
>   /* Protected by vhost_scsi->dev.mutex */
> @@ -82,6 +83,12 @@ struct vhost_scsi {
>  
>   struct vhost_work vs_completion_work; /* cmd completion work item */
>   struct llist_head vs_completion_list; /* cmd completion queue */
> +
> + struct vhost_work vs_event_work; /* evt injection work item */
> + struct llist_head vs_event_list; /* evt injection queue */
> +
> + bool vs_events_missed; /* any missed events, protected by vq->mutex */
> + int vs_events_nr; /* num of pending events, protected by vq->mutex */
>  };
>  
>  /* Local pointer to allocated TCM configfs fabric module */
> @@ -349,6 +356,37 @@ static int tcm_vhost_queue_tm_rsp(struct se_cmd *se_cmd)
>   return 0;
>  }
>  
> +static void tcm_vhost_free_evt(struct vhost_scsi *vs, struct tcm_vhost_evt 
> *evt)
> +{
> + vs->vs_events_nr--;
> + kfree(evt);
> +}
> +
> +static struct tcm_vhost_evt *tcm_vhost_allocate_evt(struct vhost_scsi *vs,
> + u32 event, u32 reason)
> +{
> + struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT];
> + struct tcm_vhost_evt *evt;
> +
> + if (vs->vs_events_nr > VHOST_SCSI_MAX_EVENT) {
> + vs->vs_events_missed = true;
> + return NULL;
> + }
> +
> + evt = kzalloc(sizeof(*evt), GFP_KERNEL);
> + if (!evt) {
> + vq_err(vq, "Failed to allocate tcm_vhost_evt\n");
> + vs->vs_events_missed = true;
> + return NULL;
> + }
> +
> + evt->event.event = event;
> + evt->event.reason = reason;
> + vs->vs_events_nr++;
> +
> + return evt;
> +}
> +
>  static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
>  {
>   struct se_cmd *se_cmd = &tv_cmd->tvc_se_cmd;
> @@ -367,6 +405,75 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd 
> *tv_cmd)
>   kfree(tv_cmd);
>  }
>  
> +static void tcm_vhost_do_evt_work(struct vhost_scsi *vs,
> + struct tcm_vhost_evt *evt)
> +{
> + struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT];
> + struct virtio_scsi_event *event = &evt->event;
> + struct virtio_scsi_event __user *eventp;
> + unsigned out, in;
> + int head, ret;
> +
> + if (!vq->private_data) {
> + vs->vs_events_missed = true;
> + return;
> + }
> +
> +again:
> + vhost_disable_notify(&vs->dev, vq);
> + head = vhost_get_vq_desc(&vs->dev, vq, vq->iov,
> + ARRAY_SIZE(vq->iov), &out, &in,
> + NULL, NULL);
> + if (head < 0) {
> + vs->vs_events_missed = true;
> + return;
> + }
> + if (head == vq->num) {
> + if (vhost_enable_notify(&vs->dev, vq))
> + goto again;
> + vs->vs_events_missed = true;
> +

Re: [PATCH v10 2/4] tcm_vhost: Add hotplug/hotunplug support

2013-04-25 Thread Asias He
MST wants one without change history in commit log.

>From 8996c9464fae1f28d0bd729677a3917d204990ec Mon Sep 17 00:00:00 2001
From: Asias He 
Date: Thu, 25 Apr 2013 09:51:26 +0800
Subject: [PATCH v10 2/4] tcm_vhost: Add hotplug/hotunplug support

In commit 365a7150094 ([SCSI] virtio-scsi: hotplug support for
virtio-scsi), hotplug support was added to virtio-scsi.

This patch adds hotplug and hotunplug support to tcm_vhost.

You can create or delete a LUN in targetcli to hotplug or hotunplug a
LUN in the guest.

Signed-off-by: Asias He 
Reviewed-by: Stefan Hajnoczi 
---

Changes in v8:
- Use vq->mutex for event
- Drop tcm_vhost: Add helper to check if endpoint is setup
- Rename vs_events_dropped to vs_events_missed
- Init lun[] explicitly

Changes in v7:
- Add vhost_work_flush for vs->vs_event_work to this series

Changes in v6:
- Pass tcm_vhost_evt to tcm_vhost_do_evt_work

Changes in v5:
- Switch vs_events_nr from u64 to int
- Set s->vs_events_dropped flag in tcm_vhost_allocate_evt
- Do not nest dev mutex within vq mutex
- Use vs_events_lock to protect vs_events_dropped and vs_events_nr
- Rebase to target/master

Changes in v4:
- Drop tcm_vhost_check_endpoint in tcm_vhost_send_evt
- Add tcm_vhost_check_endpoint in vhost_scsi_evt_handle_kick

Changes in v3:
- Separate the bug fix to another thread

Changes in v2:
- Remove code duplication in tcm_vhost_{hotplug,hotunplug}
- Fix racing of vs_events_nr
- Add flush fix patch to this series

 drivers/vhost/tcm_vhost.c | 210 +-
 drivers/vhost/tcm_vhost.h |  10 +++
 2 files changed, 218 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
index 822cd1f..5340fd7 100644
--- a/drivers/vhost/tcm_vhost.c
+++ b/drivers/vhost/tcm_vhost.c
@@ -71,6 +71,7 @@ enum {
 
 #define VHOST_SCSI_MAX_TARGET  256
 #define VHOST_SCSI_MAX_VQ  128
+#define VHOST_SCSI_MAX_EVENT   128
 
 struct vhost_scsi {
/* Protected by vhost_scsi->dev.mutex */
@@ -82,6 +83,12 @@ struct vhost_scsi {
 
struct vhost_work vs_completion_work; /* cmd completion work item */
struct llist_head vs_completion_list; /* cmd completion queue */
+
+   struct vhost_work vs_event_work; /* evt injection work item */
+   struct llist_head vs_event_list; /* evt injection queue */
+
+   bool vs_events_missed; /* any missed events, protected by vq->mutex */
+   int vs_events_nr; /* num of pending events, protected by vq->mutex */
 };
 
 /* Local pointer to allocated TCM configfs fabric module */
@@ -349,6 +356,37 @@ static int tcm_vhost_queue_tm_rsp(struct se_cmd *se_cmd)
return 0;
 }
 
+static void tcm_vhost_free_evt(struct vhost_scsi *vs, struct tcm_vhost_evt 
*evt)
+{
+   vs->vs_events_nr--;
+   kfree(evt);
+}
+
+static struct tcm_vhost_evt *tcm_vhost_allocate_evt(struct vhost_scsi *vs,
+   u32 event, u32 reason)
+{
+   struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT];
+   struct tcm_vhost_evt *evt;
+
+   if (vs->vs_events_nr > VHOST_SCSI_MAX_EVENT) {
+   vs->vs_events_missed = true;
+   return NULL;
+   }
+
+   evt = kzalloc(sizeof(*evt), GFP_KERNEL);
+   if (!evt) {
+   vq_err(vq, "Failed to allocate tcm_vhost_evt\n");
+   vs->vs_events_missed = true;
+   return NULL;
+   }
+
+   evt->event.event = event;
+   evt->event.reason = reason;
+   vs->vs_events_nr++;
+
+   return evt;
+}
+
 static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
 {
struct se_cmd *se_cmd = &tv_cmd->tvc_se_cmd;
@@ -367,6 +405,75 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd 
*tv_cmd)
kfree(tv_cmd);
 }
 
+static void tcm_vhost_do_evt_work(struct vhost_scsi *vs,
+   struct tcm_vhost_evt *evt)
+{
+   struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT];
+   struct virtio_scsi_event *event = &evt->event;
+   struct virtio_scsi_event __user *eventp;
+   unsigned out, in;
+   int head, ret;
+
+   if (!vq->private_data) {
+   vs->vs_events_missed = true;
+   return;
+   }
+
+again:
+   vhost_disable_notify(&vs->dev, vq);
+   head = vhost_get_vq_desc(&vs->dev, vq, vq->iov,
+   ARRAY_SIZE(vq->iov), &out, &in,
+   NULL, NULL);
+   if (head < 0) {
+   vs->vs_events_missed = true;
+   return;
+   }
+   if (head == vq->num) {
+   if (vhost_enable_notify(&vs->dev, vq))
+   goto again;
+   vs->vs_events_missed = true;
+   return;
+   }
+
+   if ((vq->iov[out].iov_len != sizeof(struct virtio_scsi_event))) {
+   vq_err(vq, "Expecting virtio_scsi_event, got %zu bytes\n",
+   vq->iov[out].iov_len);
+   vs->vs_events_missed = true;
+   return;
+   }
+
+   if (vs->vs_events_missed) {
+