[PATCH v2 7/8] KVM: VMX: clean up bit operation on SECONDARY_VM_EXEC_CONTROL

2015-09-08 Thread Xiao Guangrong
Use vmcs_set_bits() and vmcs_clear_bits() to clean up the code

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/vmx.c | 31 ---
 1 file changed, 8 insertions(+), 23 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5a074d0..f18f744 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6643,7 +6643,6 @@ static int nested_vmx_check_permission(struct kvm_vcpu 
*vcpu)
 
 static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
 {
-   u32 exec_control;
if (vmx->nested.current_vmptr == -1ull)
return;
 
@@ -6656,9 +6655,8 @@ static inline void nested_release_vmcs12(struct vcpu_vmx 
*vmx)
   they were modified */
copy_shadow_to_vmcs12(vmx);
vmx->nested.sync_shadow_vmcs = false;
-   exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
-   exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
-   vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+   vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+   SECONDARY_EXEC_SHADOW_VMCS);
vmcs_write64(VMCS_LINK_POINTER, -1ull);
}
vmx->nested.posted_intr_nv = -1;
@@ -7054,7 +7052,6 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 {
struct vcpu_vmx *vmx = to_vmx(vcpu);
gpa_t vmptr;
-   u32 exec_control;
 
if (!nested_vmx_check_permission(vcpu))
return 1;
@@ -7086,9 +7083,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
vmx->nested.current_vmcs12 = new_vmcs12;
vmx->nested.current_vmcs12_page = page;
if (enable_shadow_vmcs) {
-   exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
-   exec_control |= SECONDARY_EXEC_SHADOW_VMCS;
-   vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+   vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_SHADOW_VMCS);
vmcs_write64(VMCS_LINK_POINTER,
 __pa(vmx->nested.current_shadow_vmcs));
vmx->nested.sync_shadow_vmcs = true;
@@ -7598,7 +7594,6 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 
*info1, u64 *info2)
 static int vmx_enable_pml(struct vcpu_vmx *vmx)
 {
struct page *pml_pg;
-   u32 exec_control;
 
pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!pml_pg)
@@ -7609,24 +7604,18 @@ static int vmx_enable_pml(struct vcpu_vmx *vmx)
vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
 
-   exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
-   exec_control |= SECONDARY_EXEC_ENABLE_PML;
-   vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+   vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_ENABLE_PML);
 
return 0;
 }
 
 static void vmx_disable_pml(struct vcpu_vmx *vmx)
 {
-   u32 exec_control;
-
ASSERT(vmx->pml_pg);
__free_page(vmx->pml_pg);
vmx->pml_pg = NULL;
 
-   exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
-   exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
-   vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+   vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_ENABLE_PML);
 }
 
 static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
@@ -8699,12 +8688,8 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
best->ebx &= ~bit(X86_FEATURE_INVPCID);
}
 
-   if (clear_exe_ctrl) {
-   u32 exec_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
-
-   exec_ctl &= ~clear_exe_ctrl;
-   vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_ctl);
-   }
+   if (clear_exe_ctrl)
+   vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, clear_exe_ctrl);
 
if (!guest_cpuid_has_pcommit(vcpu) && nested)
vmx->nested.nested_vmx_secondary_ctls_high &=
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 5/8] KVM: VMX: simplify invpcid handling in vmx_cpuid_update()

2015-09-08 Thread Xiao Guangrong
If vmx_invpcid_supported() is true, the secondary execution control
field must be supported and SECONDARY_EXEC_ENABLE_INVPCID
must have already been set in the current vmcs by
vmx_secondary_exec_control().

If vmx_invpcid_supported() is false, there is no need to clear
SECONDARY_EXEC_ENABLE_INVPCID.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/vmx.c | 17 +
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bcc69de..97e3340 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -8695,19 +8695,12 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
/* Exposing INVPCID only when PCID is exposed */
best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
if (vmx_invpcid_supported() &&
-   best && (best->ebx & bit(X86_FEATURE_INVPCID)) &&
-   guest_cpuid_has_pcid(vcpu)) {
+   (!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) ||
+   !guest_cpuid_has_pcid(vcpu))) {
exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
-   exec_control |= SECONDARY_EXEC_ENABLE_INVPCID;
-   vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
-exec_control);
-   } else {
-   if (cpu_has_secondary_exec_ctrls()) {
-   exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
-   exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
-   vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
-exec_control);
-   }
+   exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
+   vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+
if (best)
best->ebx &= ~bit(X86_FEATURE_INVPCID);
}
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 4/8] KVM: VMX: simplify rdtscp handling in vmx_cpuid_update()

2015-09-08 Thread Xiao Guangrong
If vmx_rdtscp_supported() is true, SECONDARY_EXEC_RDTSCP must
have already been set in the current vmcs by
vmx_secondary_exec_control().

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/vmx.c | 17 -
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 61d44b0..bcc69de 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -8678,16 +8678,15 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
vmx->rdtscp_enabled = false;
if (vmx_rdtscp_supported()) {
exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
-   if (exec_control & SECONDARY_EXEC_RDTSCP) {
-   best = kvm_find_cpuid_entry(vcpu, 0x8001, 0);
-   if (best && (best->edx & bit(X86_FEATURE_RDTSCP)))
-   vmx->rdtscp_enabled = true;
-   else {
-   exec_control &= ~SECONDARY_EXEC_RDTSCP;
-   vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
-   exec_control);
-   }
+   best = kvm_find_cpuid_entry(vcpu, 0x8001, 0);
+   if (best && (best->edx & bit(X86_FEATURE_RDTSCP)))
+   vmx->rdtscp_enabled = true;
+   else {
+   exec_control &= ~SECONDARY_EXEC_RDTSCP;
+   vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+   exec_control);
}
+
if (nested && !vmx->rdtscp_enabled)
vmx->nested.nested_vmx_secondary_ctls_high &=
~SECONDARY_EXEC_RDTSCP;
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 6/8] KVM: VMX: unify SECONDARY_VM_EXEC_CONTROL update

2015-09-08 Thread Xiao Guangrong
Unify the update in vmx_cpuid_update()

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/vmx.c | 21 +++--
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 97e3340..5a074d0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -8673,19 +8673,15 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 {
struct kvm_cpuid_entry2 *best;
struct vcpu_vmx *vmx = to_vmx(vcpu);
-   u32 exec_control;
+   u32 clear_exe_ctrl = 0;
 
vmx->rdtscp_enabled = false;
if (vmx_rdtscp_supported()) {
-   exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
best = kvm_find_cpuid_entry(vcpu, 0x8001, 0);
if (best && (best->edx & bit(X86_FEATURE_RDTSCP)))
vmx->rdtscp_enabled = true;
-   else {
-   exec_control &= ~SECONDARY_EXEC_RDTSCP;
-   vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
-   exec_control);
-   }
+   else
+   clear_exe_ctrl |= SECONDARY_EXEC_RDTSCP;
 
if (nested && !vmx->rdtscp_enabled)
vmx->nested.nested_vmx_secondary_ctls_high &=
@@ -8697,14 +8693,19 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
if (vmx_invpcid_supported() &&
(!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) ||
!guest_cpuid_has_pcid(vcpu))) {
-   exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
-   exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
-   vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+   clear_exe_ctrl |= SECONDARY_EXEC_ENABLE_INVPCID;
 
if (best)
best->ebx &= ~bit(X86_FEATURE_INVPCID);
}
 
+   if (clear_exe_ctrl) {
+   u32 exec_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+
+   exec_ctl &= ~clear_exe_ctrl;
+   vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_ctl);
+   }
+
if (!guest_cpuid_has_pcommit(vcpu) && nested)
vmx->nested.nested_vmx_secondary_ctls_high &=
~SECONDARY_EXEC_PCOMMIT;
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 8/8] KVM: VMX: drop rdtscp_enabled field

2015-09-08 Thread Xiao Guangrong
Check the guest CPUID bit instead of the rdtscp_enabled field

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/cpuid.h |  8 
 arch/x86/kvm/vmx.c   | 19 ++-
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index aed7bfe..d434ee9 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -141,4 +141,12 @@ static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu 
*vcpu)
best = kvm_find_cpuid_entry(vcpu, 7, 0);
return best && (best->ebx & bit(X86_FEATURE_PCOMMIT));
 }
+
+static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu)
+{
+   struct kvm_cpuid_entry2 *best;
+
+   best = kvm_find_cpuid_entry(vcpu, 0x8001, 0);
+   return best && (best->edx & bit(X86_FEATURE_RDTSCP));
+}
 #endif
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f18f744..2e98e6d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -532,8 +532,6 @@ struct vcpu_vmx {
s64 vnmi_blocked_time;
u32 exit_reason;
 
-   bool rdtscp_enabled;
-
/* Posted interrupt descriptor */
struct pi_desc pi_desc;
 
@@ -2207,7 +2205,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
if (index >= 0)
move_msr_up(vmx, index, save_nmsrs++);
index = __find_msr_index(vmx, MSR_TSC_AUX);
-   if (index >= 0 && vmx->rdtscp_enabled)
+   if (index >= 0 && guest_cpuid_has_rdtscp(&vmx->vcpu))
move_msr_up(vmx, index, save_nmsrs++);
/*
 * MSR_STAR is only needed on long mode guests, and only
@@ -2674,7 +2672,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct 
msr_data *msr_info)
msr_info->data = vcpu->arch.ia32_xss;
break;
case MSR_TSC_AUX:
-   if (!to_vmx(vcpu)->rdtscp_enabled)
+   if (!guest_cpuid_has_rdtscp(vcpu))
return 1;
/* Otherwise falls through */
default:
@@ -2780,7 +2778,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct 
msr_data *msr_info)
clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
break;
case MSR_TSC_AUX:
-   if (!vmx->rdtscp_enabled)
+   if (!guest_cpuid_has_rdtscp(vcpu))
return 1;
/* Check reserved bit, higher 32 bits should be zero */
if ((data >> 32) != 0)
@@ -8664,15 +8662,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 clear_exe_ctrl = 0;
 
-   vmx->rdtscp_enabled = false;
-   if (vmx_rdtscp_supported()) {
-   best = kvm_find_cpuid_entry(vcpu, 0x8001, 0);
-   if (best && (best->edx & bit(X86_FEATURE_RDTSCP)))
-   vmx->rdtscp_enabled = true;
-   else
-   clear_exe_ctrl |= SECONDARY_EXEC_RDTSCP;
+   if (vmx_rdtscp_supported() && !guest_cpuid_has_rdtscp(vcpu)) {
+   clear_exe_ctrl |= SECONDARY_EXEC_RDTSCP;
 
-   if (nested && !vmx->rdtscp_enabled)
+   if (nested)
vmx->nested.nested_vmx_secondary_ctls_high &=
~SECONDARY_EXEC_RDTSCP;
}
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 3/8] KVM: VMX: drop rdtscp_enabled check in prepare_vmcs02()

2015-09-08 Thread Xiao Guangrong
SECONDARY_EXEC_RDTSCP set for L2 guest comes from vmcs12

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/vmx.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 073cbc8..61d44b0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9323,8 +9323,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct 
vmcs12 *vmcs12)
 
if (cpu_has_secondary_exec_ctrls()) {
exec_control = vmx_secondary_exec_control(vmx);
-   if (!vmx->rdtscp_enabled)
-   exec_control &= ~SECONDARY_EXEC_RDTSCP;
+
/* Take the following fields only from vmcs12 */
exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
  SECONDARY_EXEC_RDTSCP |
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 0/8] KVM: x86: enable cflushopt/clwb/pcommit and simplify code

2015-09-08 Thread Xiao Guangrong
Changelog:
Thanks to Paolo for the review. These are the changes in v2:
- use WARN_ON(1) instead of BUG() if a PCOMMIT exit happens for the L1 guest
- drop set_clear_2nd_exec_ctrl() and use vmcs_{set,clear}_bits instead
- improve the commit log and adjust the code style

This patchset enables the clflushopt, clwb and pcommit instructions for the
guest; these instructions are used by NVDIMM.

The specification is located at:
https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf

Patch 1 and patch 2 enable these three instructions for the guest, and the
other patches simplify the current VMX code.

Xiao Guangrong (8):
  KVM: x86: allow guest to use cflushopt and clwb
  KVM: x86: add pcommit support
  KVM: VMX: drop rdtscp_enabled check in prepare_vmcs02()
  KVM: VMX: simplify rdtscp handling in vmx_cpuid_update()
  KVM: VMX: simplify invpcid handling in vmx_cpuid_update()
  KVM: VMX: unify SECONDARY_VM_EXEC_CONTROL update
  KVM: VMX: clean up bit operation on SECONDARY_VM_EXEC_CONTROL
  KVM: VMX: drop rdtscp_enabled field

 arch/x86/include/asm/vmx.h  |   2 +-
 arch/x86/include/uapi/asm/vmx.h |   4 +-
 arch/x86/kvm/cpuid.c|   2 +-
 arch/x86/kvm/cpuid.h|  16 +++
 arch/x86/kvm/vmx.c  | 103 ++--
 5 files changed, 67 insertions(+), 60 deletions(-)

-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 2/8] KVM: x86: add pcommit support

2015-09-08 Thread Xiao Guangrong
Pass PCOMMIT CPU feature to guest to enable PCOMMIT instruction

Currently we do not catch the pcommit instruction for the L1 guest, and we
allow L1 to catch this instruction for L2 if, as required by the spec,
L1 can enumerate the PCOMMIT instruction via CPUID:
| IA32_VMX_PROCBASED_CTLS2[53] (which enumerates support for the
| 1-setting of PCOMMIT exiting) is always the same as
| CPUID.07H:EBX.PCOMMIT[bit 22]. Thus, software can set PCOMMIT exiting
| to 1 if and only if the PCOMMIT instruction is enumerated via CPUID

The spec can be found at
https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf

Signed-off-by: Xiao Guangrong 
---
 arch/x86/include/asm/vmx.h  |  2 +-
 arch/x86/include/uapi/asm/vmx.h |  4 +++-
 arch/x86/kvm/cpuid.c|  2 +-
 arch/x86/kvm/cpuid.h|  8 
 arch/x86/kvm/vmx.c  | 29 -
 5 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 448b7ca..d25f32a 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -72,7 +72,7 @@
 #define SECONDARY_EXEC_SHADOW_VMCS  0x4000
 #define SECONDARY_EXEC_ENABLE_PML   0x0002
 #define SECONDARY_EXEC_XSAVES  0x0010
-
+#define SECONDARY_EXEC_PCOMMIT 0x0020
 
 #define PIN_BASED_EXT_INTR_MASK 0x0001
 #define PIN_BASED_NMI_EXITING   0x0008
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 37fee27..5b15d94 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -78,6 +78,7 @@
 #define EXIT_REASON_PML_FULL62
 #define EXIT_REASON_XSAVES  63
 #define EXIT_REASON_XRSTORS 64
+#define EXIT_REASON_PCOMMIT 65
 
 #define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
@@ -126,7 +127,8 @@
{ EXIT_REASON_INVVPID,   "INVVPID" }, \
{ EXIT_REASON_INVPCID,   "INVPCID" }, \
{ EXIT_REASON_XSAVES,"XSAVES" }, \
-   { EXIT_REASON_XRSTORS,   "XRSTORS" }
+   { EXIT_REASON_XRSTORS,   "XRSTORS" }, \
+   { EXIT_REASON_PCOMMIT,   "PCOMMIT" }
 
 #define VMX_ABORT_SAVE_GUEST_MSR_FAIL1
 #define VMX_ABORT_LOAD_HOST_MSR_FAIL 4
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 962fc7d..faeb0b3 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -348,7 +348,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
-   F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB);
+   F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT);
 
/* cpuid 0xD.1.eax */
const u32 kvm_supported_word10_x86_features =
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index dd05b9c..aed7bfe 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -133,4 +133,12 @@ static inline bool guest_cpuid_has_mpx(struct kvm_vcpu 
*vcpu)
best = kvm_find_cpuid_entry(vcpu, 7, 0);
return best && (best->ebx & bit(X86_FEATURE_MPX));
 }
+
+static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu)
+{
+   struct kvm_cpuid_entry2 *best;
+
+   best = kvm_find_cpuid_entry(vcpu, 7, 0);
+   return best && (best->ebx & bit(X86_FEATURE_PCOMMIT));
+}
 #endif
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index da1590e..073cbc8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2474,7 +2474,8 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx 
*vmx)
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_WBINVD_EXITING |
-   SECONDARY_EXEC_XSAVES;
+   SECONDARY_EXEC_XSAVES |
+   SECONDARY_EXEC_PCOMMIT;
 
if (enable_ept) {
/* nested EPT: emulate EPT also to L1 */
@@ -3015,7 +3016,8 @@ static __init int setup_vmcs_config(struct vmcs_config 
*vmcs_conf)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_SHADOW_VMCS |
SECONDARY_EXEC_XSAVES |
-   SECONDARY_EXEC_ENABLE_PML;
+   SECONDARY_EXEC_ENABLE_PML |
+   SECONDARY_EXEC_PCOMMIT;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -4570,6 +4572,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx 
*vmx)
/* PML is enabled/disabled in creating/destorying vcp

[PATCH v2 1/8] KVM: x86: allow guest to use cflushopt and clwb

2015-09-08 Thread Xiao Guangrong
Pass these CPU features to guest to enable them in guest

They are needed by nvdimm drivers

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/cpuid.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 2fbea25..962fc7d 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -348,7 +348,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
-   F(AVX512CD);
+   F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB);
 
/* cpuid 0xD.1.eax */
const u32 kvm_supported_word10_x86_features =
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/9] KVM: x86: add pcommit support

2015-09-08 Thread Paolo Bonzini


On 08/09/2015 16:17, Xiao Guangrong wrote:
> 
> BTW, the spec said:
> 
> | IA32_VMX_PROCBASED_CTLS2[53] (which enumerates support for the
> 1-setting of “PCOMMIT exiting”) is
> | always the same as CPUID.07H:EBX.PCOMMIT[bit 22]. Thus, software can
> set “PCOMMIT exiting” to 1
> | if and only if the PCOMMIT instruction is enumerated via CPUID

Thanks.  Can you add it to the commit message ("allow L1 to catch this
instruction for L2 if, as required by the spec, L1 can enumerate the
PCOMMIT instruction via CPUID").

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 8/9] KVM: VMX: introduce set_clear_2nd_exec_ctrl()

2015-09-08 Thread Paolo Bonzini


On 08/09/2015 16:24, Xiao Guangrong wrote:
>>
>> The second argument is always true.
> 
> No...
> 
> There are 3 places calling this function with set=false:
> nested_release_vmcs12(), vmx_disable_pml() and
> vmx_cpuid_update()

You're right.  It's always constant---I don't know why I wrote it's
always true, and then suggested vmcs_clear32...

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


GOOD NEWS

2015-09-08 Thread
Dear Friend,

I Am Stani I Have A Project Work For You Kindly Reply Me With My E-mail For 
More Details.Thank You.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


GOOD NEWS

2015-09-08 Thread
Dear Friend,

I Am Stani I Have A Project Work For You Kindly Reply Me With My E-mail For 
More Details.Thank You.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 答复: I'm now looking into kvm-unit-tests and encounted with some problems.

2015-09-08 Thread Lucas Meneghel Rodrigues

Paolo,

I did try with unrestricted_guest=0, and I'm still getting the failure.

On Mon, Sep 7, 2015 at 9:32 AM, Paolo Bonzini  
wrote:



On 02/09/2015 05:33, Guoyanjuan wrote:

 Hi,  I found my code is old and I git the latest code from
 
https://git.kernel.org/pub/scm/virt/kvm/kvm-unit-tests.git,

 some problems are solved but one.

 when I run emulate unittest, it failed.

 command:
 qemu-kvm --enable-kvm -device pc-testdev -device
 isa-debug-exit,iobase=0xf4,iosize=0x4 -serial stdio -device
 pci-testdev -kernel x86/emulator.flat -vnc none

 logs:
 FAIL: mov null, %ss


Lucas Meneghel Rodrigues also reproduced this, it seems to be 
processor

dependent.  I haven't debugged it yet because it doesn't reproduce on
the two systems I've tested on (Ivy Bridge i7 and Haswell Xeon E5).

Can you please also try loading the kvm-intel module with the
"unrestricted_guest=0" parameter, and see if it also reproduces?

It might be a processor bug too.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 6/8] arm/arm64: KVM: Add forwarded physical interrupts documentation

2015-09-08 Thread Andre Przywara
Hi Eric,

thanks for your answer.

On 08/09/15 09:43, Eric Auger wrote:
> Hi Andre,
> On 09/07/2015 01:25 PM, Andre Przywara wrote:
>> Hi,
>>
>> firstly: this text is really great, thanks for coming up with that.
>> See below for some information I got from tracing the host which I
>> cannot make sense of
>>
>>
>> On 04/09/15 20:40, Christoffer Dall wrote:
>>> Forwarded physical interrupts on arm/arm64 is a tricky concept and the
>>> way we deal with them is not apparently easy to understand by reading
>>> various specs.
>>>
>>> Therefore, add a proper documentation file explaining the flow and
>>> rationale of the behavior of the vgic.
>>>
>>> Some of this text was contributed by Marc Zyngier and edited by me.
>>> Omissions and errors are all mine.
>>>
>>> Signed-off-by: Christoffer Dall 
>>> ---
>>>  Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt | 181 
>>> +
>>>  1 file changed, 181 insertions(+)
>>>  create mode 100644 Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt
>>>
>>> diff --git a/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt 
>>> b/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt
>>> new file mode 100644
>>> index 000..24b6f28
>>> --- /dev/null
>>> +++ b/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt
>>> @@ -0,0 +1,181 @@
>>> +KVM/ARM VGIC Forwarded Physical Interrupts
>>> +==
>>> +
>>> +The KVM/ARM code implements software support for the ARM Generic
>>> +Interrupt Controller's (GIC's) hardware support for virtualization by
>>> +allowing software to inject virtual interrupts to a VM, which the guest
>>> +OS sees as regular interrupts.  The code is famously known as the VGIC.
>>> +
>>> +Some of these virtual interrupts, however, correspond to physical
>>> +interrupts from real physical devices.  One example could be the
>>> +architected timer, which itself supports virtualization, and therefore
>>> +lets a guest OS program the hardware device directly to raise an
>>> +interrupt at some point in time.  When such an interrupt is raised, the
>>> +host OS initially handles the interrupt and must somehow signal this
>>> +event as a virtual interrupt to the guest.  Another example could be a
>>> +passthrough device, where the physical interrupts are initially handled
>>> +by the host, but the device driver for the device lives in the guest OS
>>> +and KVM must therefore somehow inject a virtual interrupt on behalf of
>>> +the physical one to the guest OS.
>>> +
>>> +These virtual interrupts corresponding to a physical interrupt on the
>>> +host are called forwarded physical interrupts, but are also sometimes
>>> +referred to as 'virtualized physical interrupts' and 'mapped interrupts'.
>>> +
>>> +Forwarded physical interrupts are handled slightly differently compared
>>> +to virtual interrupts generated purely by a software emulated device.
>>> +
>>> +
>>> +The HW bit
>>> +--
>>> +Virtual interrupts are signalled to the guest by programming the List
>>> +Registers (LRs) on the GIC before running a VCPU.  The LR is programmed
>>> +with the virtual IRQ number and the state of the interrupt (Pending,
>>> +Active, or Pending+Active).  When the guest ACKs and EOIs a virtual
>>> +interrupt, the LR state moves from Pending to Active, and finally to
>>> +inactive.
>>> +
>>> +The LRs include an extra bit, called the HW bit.  When this bit is set,
>>> +KVM must also program an additional field in the LR, the physical IRQ
>>> +number, to link the virtual with the physical IRQ.
>>> +
>>> +When the HW bit is set, KVM must EITHER set the Pending OR the Active
>>> +bit, never both at the same time.
>>> +
>>> +Setting the HW bit causes the hardware to deactivate the physical
>>> +interrupt on the physical distributor when the guest deactivates the
>>> +corresponding virtual interrupt.
>>> +
>>> +
>>> +Forwarded Physical Interrupts Life Cycle
>>> +
>>> +
>>> +The state of forwarded physical interrupts is managed in the following way:
>>> +
>>> +  - The physical interrupt is acked by the host, and becomes active on
>>> +the physical distributor (*).
>>> +  - KVM sets the LR.Pending bit, because this is the only way the GICV
>>> +interface is going to present it to the guest.
>>> +  - LR.Pending will stay set as long as the guest has not acked the 
>>> interrupt.
>>> +  - LR.Pending transitions to LR.Active on the guest read of the IAR, as
>>> +expected.
>>> +  - On guest EOI, the *physical distributor* active bit gets cleared,
>>> +but the LR.Active is left untouched (set).
>>
>> I tried hard in the last week, but couldn't confirm this. Tracing shows
>> the following pattern over and over (case 1):
>> (This is the kvm/kvm.git:queue branch from last week, so including the
>> mapped timer IRQ code. Tests were done on Juno and Midway)
>>
>> ...
>> 229.340171: kvm_exit: TRAP: HSR_EC: 0x0001 (WFx), PC: 0xffc98a64
>> 229.340324: kvm_exit: IRQ: HSR_EC: 0x0001 (WFx), PC: 0

Re: [PATCH 8/9] KVM: VMX: introduce set_clear_2nd_exec_ctrl()

2015-09-08 Thread Xiao Guangrong



On 09/07/2015 07:27 PM, Paolo Bonzini wrote:



On 21/08/2015 06:50, Xiao Guangrong wrote:


+static void set_clear_2nd_exec_ctrl(u32 ctrls, bool set)
+{
+   u32 exec_ctrl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+
+   if (set)
+   exec_ctrl |= ctrls;
+   else
+   exec_ctrl &= ~ctrls;
+
+   vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_ctrl);
+}


The second argument is always true.


No...

There are 3 places calling this function with set=false:
nested_release_vmcs12(), vmx_disable_pml() and
vmx_cpuid_update()



Do you have any plans for it?

Should we instead add functions like vmcs_or32 and vmcs_clear32?



Sounds good to me, will do it.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 6/9] KVM: VMX: simplify invpcid handling in vmx_cpuid_update()

2015-09-08 Thread Xiao Guangrong



On 09/07/2015 07:28 PM, Paolo Bonzini wrote:



On 21/08/2015 06:50, Xiao Guangrong wrote:

+   if (vmx_invpcid_supported() && (!best ||


Please start the "(" subexpression on a new line.



Okay, will fix.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/9] KVM: x86: add pcommit support

2015-09-08 Thread Xiao Guangrong



On 09/07/2015 07:18 PM, Paolo Bonzini wrote:



+static int handle_pcommit(struct kvm_vcpu *vcpu)
+{
+   /* we never catch pcommit instruct for L1 guest. */
+   BUG();


Please WARN instead.



Okay.


+   return 1;
+}
+
  /*
   * The exit handlers return 1 if the exit was handled fully and guest 
execution
   * may resume.  Otherwise they set the kvm_run parameter to indicate what 
needs
@@ -7258,6 +7269,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct 
kvm_vcpu *vcpu) = {
[EXIT_REASON_XSAVES]  = handle_xsaves,
[EXIT_REASON_XRSTORS] = handle_xrstors,
[EXIT_REASON_PML_FULL]= handle_pml_full,
+   [EXIT_REASON_PCOMMIT] = handle_pcommit,
  };

  static const int kvm_vmx_max_exit_handlers =
@@ -7559,6 +7571,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 * the XSS exit bitmap in vmcs12.
 */
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
+   case EXIT_REASON_PCOMMIT:
+   return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT);
default:
return true;
}
@@ -8688,6 +8702,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
if (best)
best->ebx &= ~bit(X86_FEATURE_INVPCID);
}
+
+   if (!guest_cpuid_has_pcommit(vcpu) && nested)
+   vmx->nested.nested_vmx_secondary_ctls_high &=
+   ~SECONDARY_EXEC_PCOMMIT;


Why is this needed?



If pcommit is not allowed in L1 guest, L1 is not allowed to intercept pcommit
for L2.

BTW, the spec said:

| IA32_VMX_PROCBASED_CTLS2[53] (which enumerates support for the 1-setting of 
“PCOMMIT exiting”) is
| always the same as CPUID.07H:EBX.PCOMMIT[bit 22]. Thus, software can set 
“PCOMMIT exiting” to 1
| if and only if the PCOMMIT instruction is enumerated via CPUID
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 6/8] arm/arm64: KVM: Add forwarded physical interrupts documentation

2015-09-08 Thread Christoffer Dall
On Mon, Sep 07, 2015 at 12:25:27PM +0100, Andre Przywara wrote:
> Hi,
> 
> firstly: this text is really great, thanks for coming up with that.
> See below for some information I got from tracing the host which I
> cannot make sense of
> 
> 
> On 04/09/15 20:40, Christoffer Dall wrote:
> > Forwarded physical interrupts on arm/arm64 is a tricky concept and the
> > way we deal with them is not apparently easy to understand by reading
> > various specs.
> > 
> > Therefore, add a proper documentation file explaining the flow and
> > rationale of the behavior of the vgic.
> > 
> > Some of this text was contributed by Marc Zyngier and edited by me.
> > Omissions and errors are all mine.
> > 
> > Signed-off-by: Christoffer Dall 
> > ---
> >  Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt | 181 
> > +
> >  1 file changed, 181 insertions(+)
> >  create mode 100644 Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt
> > 
> > diff --git a/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt 
> > b/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt
> > new file mode 100644
> > index 000..24b6f28
> > --- /dev/null
> > +++ b/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt
> > @@ -0,0 +1,181 @@
> > +KVM/ARM VGIC Forwarded Physical Interrupts
> > +==
> > +
> > +The KVM/ARM code implements software support for the ARM Generic
> > +Interrupt Controller's (GIC's) hardware support for virtualization by
> > +allowing software to inject virtual interrupts to a VM, which the guest
> > +OS sees as regular interrupts.  The code is famously known as the VGIC.
> > +
> > +Some of these virtual interrupts, however, correspond to physical
> > +interrupts from real physical devices.  One example could be the
> > +architected timer, which itself supports virtualization, and therefore
> > +lets a guest OS program the hardware device directly to raise an
> > +interrupt at some point in time.  When such an interrupt is raised, the
> > +host OS initially handles the interrupt and must somehow signal this
> > +event as a virtual interrupt to the guest.  Another example could be a
> > +passthrough device, where the physical interrupts are initially handled
> > +by the host, but the device driver for the device lives in the guest OS
> > +and KVM must therefore somehow inject a virtual interrupt on behalf of
> > +the physical one to the guest OS.
> > +
> > +These virtual interrupts corresponding to a physical interrupt on the
> > +host are called forwarded physical interrupts, but are also sometimes
> > +referred to as 'virtualized physical interrupts' and 'mapped interrupts'.
> > +
> > +Forwarded physical interrupts are handled slightly differently compared
> > +to virtual interrupts generated purely by a software emulated device.
> > +
> > +
> > +The HW bit
> > +--
> > +Virtual interrupts are signalled to the guest by programming the List
> > +Registers (LRs) on the GIC before running a VCPU.  The LR is programmed
> > +with the virtual IRQ number and the state of the interrupt (Pending,
> > +Active, or Pending+Active).  When the guest ACKs and EOIs a virtual
> > +interrupt, the LR state moves from Pending to Active, and finally to
> > +inactive.
> > +
> > +The LRs include an extra bit, called the HW bit.  When this bit is set,
> > +KVM must also program an additional field in the LR, the physical IRQ
> > +number, to link the virtual with the physical IRQ.
> > +
> > +When the HW bit is set, KVM must EITHER set the Pending OR the Active
> > +bit, never both at the same time.
> > +
> > +Setting the HW bit causes the hardware to deactivate the physical
> > +interrupt on the physical distributor when the guest deactivates the
> > +corresponding virtual interrupt.
> > +
> > +
> > +Forwarded Physical Interrupts Life Cycle
> > +
> > +
> > +The state of forwarded physical interrupts is managed in the following way:
> > +
> > +  - The physical interrupt is acked by the host, and becomes active on
> > +the physical distributor (*).
> > +  - KVM sets the LR.Pending bit, because this is the only way the GICV
> > +interface is going to present it to the guest.
> > +  - LR.Pending will stay set as long as the guest has not acked the 
> > interrupt.
> > +  - LR.Pending transitions to LR.Active on the guest read of the IAR, as
> > +expected.
> > +  - On guest EOI, the *physical distributor* active bit gets cleared,
> > +but the LR.Active is left untouched (set).
> 
> I tried hard in the last week, but couldn't confirm this. Tracing shows
> the following pattern over and over (case 1):
> (This is the kvm/kvm.git:queue branch from last week, so including the
> mapped timer IRQ code. Tests were done on Juno and Midway)
> 
> ...
> 229.340171: kvm_exit: TRAP: HSR_EC: 0x0001 (WFx), PC: 0xffc98a64
> 229.340324: kvm_exit: IRQ: HSR_EC: 0x0001 (WFx), PC: 0xffc0001c63a0
> 229.340428: kvm_exit: TRAP: HSR_EC: 0x0024 (DABT_LOW), 

Re: [Qemu-devel] [PATCH 19/23] userfaultfd: activate syscall

2015-09-08 Thread Dr. David Alan Gilbert
* Bharata B Rao (bhar...@linux.vnet.ibm.com) wrote:
> On Tue, Sep 08, 2015 at 01:46:52PM +0100, Dr. David Alan Gilbert wrote:
> > * Bharata B Rao (bhar...@linux.vnet.ibm.com) wrote:
> > > On Tue, Sep 08, 2015 at 09:59:47AM +0100, Dr. David Alan Gilbert wrote:
> > > > * Bharata B Rao (bhar...@linux.vnet.ibm.com) wrote:
> > > > > In fact I had successfully done postcopy migration of sPAPR guest with
> > > > > this setup.
> > > > 
> > > > Interesting - I'd not got that far myself on power; I was hitting a 
> > > > problem
> > > > loading htab ( htab_load() bad index 2113929216 (14848+0 entries) in 
> > > > htab stream (htab_shift=25) )
> > > > 
> > > > Did you have to make any changes to the qemu code to get that happy?
> > > 
> > > I should have mentioned that I tried only QEMU driven migration within
> > > the same host using wp3-postcopy branch of your tree. I don't see the
> > > above issue.
> > > 
> > > (qemu) info migrate
> > > capabilities: xbzrle: off rdma-pin-all: off auto-converge: off 
> > > zero-blocks: off compress: off x-postcopy-ram: on 
> > > Migration status: completed
> > > total time: 39432 milliseconds
> > > downtime: 162 milliseconds
> > > setup: 14 milliseconds
> > > transferred ram: 1297209 kbytes
> > > throughput: 270.72 mbps
> > > remaining ram: 0 kbytes
> > > total ram: 4194560 kbytes
> > > duplicate: 734015 pages
> > > skipped: 0 pages
> > > normal: 318469 pages
> > > normal bytes: 1273876 kbytes
> > > dirty sync count: 4
> > > 
> > > I will try migration between different hosts soon and check.
> > 
> > I hit that on the same host; are you sure you've switched into postcopy 
> > mode;
> > i.e. issued a migrate_start_postcopy before the end of migration?
> 
> Sorry I was following your discussion with Li in this thread
> 
> https://www.marc.info/?l=qemu-devel&m=143035620026744&w=4
> 
> and it wasn't obvious to me that anything apart from turning on the
> x-postcopy-ram capability was required :(

OK.

> So I do see the problem now.
> 
> At the source
> -
> Error reading data from KVM HTAB fd: Bad file descriptor
> Segmentation fault
> 
> At the target
> -
> htab_load() bad index 2113929216 (14336+0 entries) in htab stream 
> (htab_shift=25)
> qemu-system-ppc64: error while loading state section id 56(spapr/htab)
> qemu-system-ppc64: postcopy_ram_listen_thread: loadvm failed: -22
> qemu-system-ppc64: VQ 0 size 0x100 Guest index 0x0 inconsistent with Host 
> index 0x1f: delta 0xffe1
> qemu-system-ppc64: error while loading state for instance 0x0 of device 
> 'pci@8002000:00.0/virtio-net'
> *** Error in `./ppc64-softmmu/qemu-system-ppc64': corrupted double-linked 
> list: 0x0100241234a0 ***
> === Backtrace: =
> /lib64/power8/libc.so.6Segmentation fault

Good - my current world has got rid of the segfaults/corruption in the cleanup 
on power - but those
are only after it stumbled over the htab problem.

I don't know the innards of power/htab, so if you've got any pointers on what 
upset it
I'd be happy for some pointers.

(We should probably trim the cc - since I don't think this is userfault 
related).

Dave

--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH v2 06/18] pc: implement NVDIMM device abstract

2015-09-08 Thread Xiao Guangrong



On 09/07/2015 09:40 PM, Igor Mammedov wrote:

On Sun, 6 Sep 2015 14:07:21 +0800
Xiao Guangrong  wrote:




On 09/02/2015 07:31 PM, Igor Mammedov wrote:

On Wed, 2 Sep 2015 18:36:43 +0800
Xiao Guangrong  wrote:




On 09/02/2015 05:58 PM, Igor Mammedov wrote:

On Fri, 14 Aug 2015 22:51:59 +0800
Xiao Guangrong  wrote:


Introduce "pc-nvdimm" device and it has two parameters:

Why do you use prefix "pc-", I suppose we potentially
could use this device not only with x86 targets but with
other targets as well.
I'd just drop 'pc' prefix through out patchset.


Yeah, the prefix is stolen from pc-dimm, will drop this
prefix as your suggestion.




- @file, which is the backed memory file for NVDIMM device

Could you try to split device into backend/frontend parts,
like it's done with pc-dimm. As I understand it's preferred
way to implement this kind of devices.
Then you could reuse memory backends that we already have
including file backend.


I considered it too and Stefan, Paolo got the same idea in
V1's review, however:

| However, file-based memory used by NVDIMM is special, it divides the file
| to two parts, one part is used as PMEM and another part is used to store
| NVDIMM's configure data.
|
| Maybe we can introduce "end-reserved" property to reserve specified size
| at the end of the file. Or create a new class type based on
| memory-backend-file (named nvdimm-backend-file) class to hide this magic
| thing?

I'd go with separate backend/frontend idea.

Question is if this config area is part backend or frontend?


Configdata area is used to store nvdimm device's configuration, normally, it's
namespace info.

Currently, we have chosen to locate configdata at the end of nvdimm's backend-memory
as it's easy to configure / use and configdata is naturally non-volatile and it
is like the layout on physical device.

However, using two separated backed-memory is okay, for example:
-object memory-backend-file,id=mem0,file=/storage/foo
-object memory-backend-file,id=mem1,file=/storage/bar
-device nvdimm,memdev=mem0,configdata=mem1
then configdata is written to a single backend.

Which one is better for you? :)


If we pass-through NVDIMM device do we need to set configdata=true
and QEMU would skip building config structures and use structures
that are already present on passed-through device in that place?



The file specified by @file is something like a normal disk, like /dev/sda/,
host process can use whole space on it. If we want to directly pass it to guest,
we can specify 'configdata=false'. If we allow guest to 'partition' (create
namespace on) it then we use 'configdata=true' to reserve some space to store
its partition info (namespace info).

As far as I understand currently linux provides to userspace only one interface
which is block device i.e. /dev/sdX and on top of it userspace can put
PM/DAX aware filesystem and use files from it. In either cases kernel
just provides access to separate namespaces and not to a whole NVDIMM which
includes 'labels area'. Hence /dev/sdX is not passed-though NVDIMM,
so we could consider it as just a file/storage that could be used by userspace.



Yes, it is.


Lets assume that NVDIMM should always have 'labels area'.
In that case I'd always reserve space for it and
  * format it (build a new one) if backend doesn't have a
valid labels area dropping configdata parameter along the way
  * or if backing-file already has valid labels area I'd just use it.


Yes.



If you need to make labels area readonly you can introduce 
'NVDIMM.readonly_labels'
option and just use labels backend's without allowing changes writeback.
It would be better to make it another series on top of basic NVDIMM 
implementation
if there is an actual usecase for it.


I'd prefer the way that discards not only its label data but also the whole 
nvdimm device,
that is, open(, RDONLY) + mmap(, MAP_PRIVATE), the idea was raised by Stefan.

The 'configdata = false' in this patchset does not aim at making label 
readonly, it provides
a way to make the file in a single partition. For example, you create a image 
in /dev/pmem0,
pass it to guest, then the whole file will appear at /dev/pmem0 in guest, and 
guest can directly
use the image in that device. Under this case, no file region is reserved and 
the label data is
built in memory, which cannot be updated by guest.



PS:
Also when you write commit messages, comment and name variables try to use 
terms from
relevant spec and mention specs where you describe data structures from them.


Parts of the names/definitions were stolen from Kernel NVDIMM driver, i will 
update it to
let them reflect the specs. Thanks, Igor.






--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH v2 08/18] nvdimm: init backend memory mapping and config data area

2015-09-08 Thread Xiao Guangrong



On 09/07/2015 10:11 PM, Igor Mammedov wrote:

On Fri, 14 Aug 2015 22:52:01 +0800
Xiao Guangrong  wrote:


The parameter @file is used as backed memory for NVDIMM which is
divided into two parts if @dataconfig is true:
- first parts is (0, size - 128K], which is used as PMEM (Persistent
   Memory)
- 128K at the end of the file, which is used as Config Data Area, it's
   used to store Label namespace data

The @file supports both regular file and block device, of course we
can assign any these two kinds of files for test and emulation, however,
in the real world for performance reasons, we usually use these files as
NVDIMM backed file:
- the regular file in the filesystem with DAX enabled created on NVDIMM
   device on host
- the raw PMEM device on host, e.g. /dev/pmem0


A lot of code in this series could reuse what QEMU already
uses for implementing pc-dimm devices.

here is common concepts that could be reused.
   - on physical system both DIMM and NVDIMM devices use
 the same slots. We could share QEMU's '-m slots' option between
 both devices. An alternative to not sharing would be to introduce
 '-machine nvdimm_slots' option.
 And yes, we need to know the number of NVDIMMs to describe
 them all in the ACPI table (taking into account future hotplug,
 including possible NVDIMM devices).
 I'd go the same way as on real hardware and make them share the same slots.


I'd prefer sharing slots for pc-dimm and nvdimm, it's easier to reuse the
logic of slot-assignment and plug/unplug.


   - they share the same physical address space and limits
 on how much memory system can handle. So I'd suggest sharing existing
 '-m maxmem' option and reuse hotplug_memory address space.


Sounds good to me.



Essentially what I'm suggesting is to inherit NVDIMM's implementation
from pc-dimm reusing all of its code/backends and
just override parts that do memory mapping into guest's address space to
accommodate NVDIMM's requirements.


Good idea!

We have to differentiate pc-dimm and nvdimm in the common code and nvdimm
has different points with pc-dimm (for example, it has a reserved region, and
need support live migration of label data). How about rename 'pc-nvdimm' to
'memory-device' and make it as a common device type, then build pc-dimm and
nvdimm on top of it?

Something like:
static TypeInfo memory_device_info = {
.name  = TYPE_MEM_DEV,
.parent= TYPE_DEVICE,
};

static TypeInfo memory_device_info = {
.name = TYPE_PC_DIMM,
.parent = TYPE_MEM_DEV,
};

static TypeInfo memory_device_info = {
.name = TYPE_NVDIMM,
.parent = TYPE_MEM_DEV,
};

It also makes CONFIG_NVDIMM and CONFIG_HOT_PLUG independent.





Signed-off-by: Xiao Guangrong 
---
  hw/mem/nvdimm/pc-nvdimm.c  | 109 -
  include/hw/mem/pc-nvdimm.h |   7 +++
  2 files changed, 115 insertions(+), 1 deletion(-)

diff --git a/hw/mem/nvdimm/pc-nvdimm.c b/hw/mem/nvdimm/pc-nvdimm.c
index 7a270a8..97710d1 100644
--- a/hw/mem/nvdimm/pc-nvdimm.c
+++ b/hw/mem/nvdimm/pc-nvdimm.c
@@ -22,12 +22,20 @@
   * License along with this library; if not, see 
   */

+#include 
+#include 
+#include 
+
+#include "exec/address-spaces.h"
  #include "hw/mem/pc-nvdimm.h"

-#define PAGE_SIZE  (1UL << 12)
+#define PAGE_SIZE   (1UL << 12)
+
+#define MIN_CONFIG_DATA_SIZE(128 << 10)

  static struct nvdimms_info {
  ram_addr_t current_addr;
+int device_index;
  } nvdimms_info;

  /* the address range [offset, ~0ULL) is reserved for NVDIMM. */
@@ -37,6 +45,26 @@ void pc_nvdimm_reserve_range(ram_addr_t offset)
  nvdimms_info.current_addr = offset;
  }

+static ram_addr_t reserved_range_push(uint64_t size)
+{
+uint64_t current;
+
+current = ROUND_UP(nvdimms_info.current_addr, PAGE_SIZE);
+
+/* do not have enough space? */
+if (current + size < current) {
+return 0;
+}
+
+nvdimms_info.current_addr = current + size;
+return current;
+}

You can't use all memory above hotplug_memory area since
we have to tell guest where 64-bit PCI window starts,
and currently it should start at reserved-memory-end
(but it isn't due to a bug: I've just posted fix to qemu-devel
  "[PATCH 0/2] pc: fix 64-bit PCI window clashing with memory hotplug region"
)


Ah, got it, thanks for pointing it out.




+
+static uint32_t new_device_index(void)
+{
+return nvdimms_info.device_index++;
+}
+
  static char *get_file(Object *obj, Error **errp)
  {
  PCNVDIMMDevice *nvdimm = PC_NVDIMM(obj);
@@ -48,6 +76,11 @@ static void set_file(Object *obj, const char *str, Error 
**errp)
  {
  PCNVDIMMDevice *nvdimm = PC_NVDIMM(obj);

+if (memory_region_size(&nvdimm->mr)) {
+error_setg(errp, "cannot change property value");
+return;
+}
+
  if (nvdimm->file) {
  g_free(nvdimm->file);
  }
@@ -76,13 +109,87 @@ static void pc_nvdimm_init(Object *obj)
   set_configdata, NULL

Re: [Qemu-devel] [PATCH 19/23] userfaultfd: activate syscall

2015-09-08 Thread Bharata B Rao
On Tue, Sep 08, 2015 at 01:46:52PM +0100, Dr. David Alan Gilbert wrote:
> * Bharata B Rao (bhar...@linux.vnet.ibm.com) wrote:
> > On Tue, Sep 08, 2015 at 09:59:47AM +0100, Dr. David Alan Gilbert wrote:
> > > * Bharata B Rao (bhar...@linux.vnet.ibm.com) wrote:
> > > > In fact I had successfully done postcopy migration of sPAPR guest with
> > > > this setup.
> > > 
> > > Interesting - I'd not got that far myself on power; I was hitting a 
> > > problem
> > > loading htab ( htab_load() bad index 2113929216 (14848+0 entries) in htab 
> > > stream (htab_shift=25) )
> > > 
> > > Did you have to make any changes to the qemu code to get that happy?
> > 
> > I should have mentioned that I tried only QEMU driven migration within
> > the same host using wp3-postcopy branch of your tree. I don't see the
> > above issue.
> > 
> > (qemu) info migrate
> > capabilities: xbzrle: off rdma-pin-all: off auto-converge: off zero-blocks: 
> > off compress: off x-postcopy-ram: on 
> > Migration status: completed
> > total time: 39432 milliseconds
> > downtime: 162 milliseconds
> > setup: 14 milliseconds
> > transferred ram: 1297209 kbytes
> > throughput: 270.72 mbps
> > remaining ram: 0 kbytes
> > total ram: 4194560 kbytes
> > duplicate: 734015 pages
> > skipped: 0 pages
> > normal: 318469 pages
> > normal bytes: 1273876 kbytes
> > dirty sync count: 4
> > 
> > I will try migration between different hosts soon and check.
> 
> I hit that on the same host; are you sure you've switched into postcopy mode;
> i.e. issued a migrate_start_postcopy before the end of migration?

Sorry I was following your discussion with Li in this thread

https://www.marc.info/?l=qemu-devel&m=143035620026744&w=4

and it wasn't obvious to me that anything apart from turning on the
x-postcopy-ram capability was required :(

So I do see the problem now.

At the source
-
Error reading data from KVM HTAB fd: Bad file descriptor
Segmentation fault

At the target
-
htab_load() bad index 2113929216 (14336+0 entries) in htab stream 
(htab_shift=25)
qemu-system-ppc64: error while loading state section id 56(spapr/htab)
qemu-system-ppc64: postcopy_ram_listen_thread: loadvm failed: -22
qemu-system-ppc64: VQ 0 size 0x100 Guest index 0x0 inconsistent with Host index 
0x1f: delta 0xffe1
qemu-system-ppc64: error while loading state for instance 0x0 of device 
'pci@8002000:00.0/virtio-net'
*** Error in `./ppc64-softmmu/qemu-system-ppc64': corrupted double-linked list: 
0x0100241234a0 ***
=== Backtrace: =
/lib64/power8/libc.so.6Segmentation fault

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH 19/23] userfaultfd: activate syscall

2015-09-08 Thread Dr. David Alan Gilbert
* Bharata B Rao (bhar...@linux.vnet.ibm.com) wrote:
> On Tue, Sep 08, 2015 at 09:59:47AM +0100, Dr. David Alan Gilbert wrote:
> > * Bharata B Rao (bhar...@linux.vnet.ibm.com) wrote:
> > > In fact I had successfully done postcopy migration of sPAPR guest with
> > > this setup.
> > 
> > Interesting - I'd not got that far myself on power; I was hitting a problem
> > loading htab ( htab_load() bad index 2113929216 (14848+0 entries) in htab 
> > stream (htab_shift=25) )
> > 
> > Did you have to make any changes to the qemu code to get that happy?
> 
> I should have mentioned that I tried only QEMU driven migration within
> the same host using wp3-postcopy branch of your tree. I don't see the
> above issue.
> 
> (qemu) info migrate
> capabilities: xbzrle: off rdma-pin-all: off auto-converge: off zero-blocks: 
> off compress: off x-postcopy-ram: on 
> Migration status: completed
> total time: 39432 milliseconds
> downtime: 162 milliseconds
> setup: 14 milliseconds
> transferred ram: 1297209 kbytes
> throughput: 270.72 mbps
> remaining ram: 0 kbytes
> total ram: 4194560 kbytes
> duplicate: 734015 pages
> skipped: 0 pages
> normal: 318469 pages
> normal bytes: 1273876 kbytes
> dirty sync count: 4
> 
> I will try migration between different hosts soon and check.

I hit that on the same host; are you sure you've switched into postcopy mode;
i.e. issued a migrate_start_postcopy before the end of migration?

(My current world I have working on x86-64 and I've also tested reasonably well
on aarch64, and get to that htab problem on Power).

Dave

> 
> Regards,
> Bharata.
> 
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH 19/23] userfaultfd: activate syscall

2015-09-08 Thread Dr. David Alan Gilbert
* Michael Ellerman (m...@ellerman.id.au) wrote:
> On Tue, 2015-09-08 at 17:14 +1000, Michael Ellerman wrote:
> > On Tue, 2015-09-08 at 12:09 +0530, Bharata B Rao wrote:
> > > On Tue, Sep 08, 2015 at 04:08:06PM +1000, Michael Ellerman wrote:
> > > > Hmm, not for me. See below.
> > > > 
> > > > What setup were you testing on Bharata?
> > > 
> > > I was on commit a94572f5799dd of userfault21 branch in Andrea's tree
> > > git://git.kernel.org/pub/scm/linux/kernel/git/andrea/aa.git
> > > 
> > > #uname -a
> > > Linux 4.1.0-rc8+ #1 SMP Tue Aug 11 11:33:50 IST 2015 ppc64le ppc64le 
> > > ppc64le GNU/Linux
> > > 
> > > In fact I had successfully done postcopy migration of sPAPR guest with
> > > this setup.
> > 
> > OK, do you mind testing mainline with the same setup to see if the selftest
> > passes.
> 
> Ah, I just tried it on big endian and it works. So it seems to not work on
> little endian for some reason, /probably/ a test case bug?

Hmm; I think we're missing a test-case fix that Andrea made for a bug I hit 
on Power
a couple of weeks back.  I think that would have been on le.

Dave

> cheers
> 
> 
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 07/10] KVM: arm/arm64: vgic: Allow HW interrupts for non-shared devices

2015-09-08 Thread Eric Auger
Hi Christoffer,
On 09/02/2015 09:42 PM, Christoffer Dall wrote:
> On Mon, Aug 10, 2015 at 03:21:01PM +0200, Eric Auger wrote:
>> From: Marc Zyngier 
>>
>> So far, the only use of the HW interrupt facility was the timer,
>> implying that the active state is context-switched for each vcpu,
>> as the device is shared across all vcpus.
>>
>> This does not work for a device that has been assigned to a VM,
>> as the guest is entirely in control of that device (the HW is
>> not shared). In that case, it makes sense to bypass the whole
>> active state switching.
>>
>> Also the VGIC state machine is adapted to support those assigned
>> (non shared) HW IRQs:
>> - can only be sampled when it is pending
>> - when queueing the IRQ (programming the LR), the pending state is
>>   removed as for edge sensitive IRQs
>> - queued state is not modelled. Level state is not modelled
>> - its injection always is valid since stemming from the HW.
>>
>> Signed-off-by: Marc Zyngier 
>> Signed-off-by: Eric Auger 
>>
>> ---
>>
>> - a mix of
>>   [PATCH v4 11/11] KVM: arm/arm64: vgic: Allow HW interrupts for
>>non-shared devices
>>   [RFC v2 2/4] KVM: arm: vgic: fix state machine for forwarded IRQ
>> ---
>>  include/kvm/arm_vgic.h|  6 +++--
>>  virt/kvm/arm/arch_timer.c |  3 ++-
>>  virt/kvm/arm/vgic.c   | 58 
>> +++
>>  3 files changed, 49 insertions(+), 18 deletions(-)
>>
>> diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
>> index d901f1a..7ef9ce0 100644
>> --- a/include/kvm/arm_vgic.h
>> +++ b/include/kvm/arm_vgic.h
>> @@ -163,7 +163,8 @@ struct irq_phys_map {
>>  u32 virt_irq;
>>  u32 phys_irq;
>>  u32 irq;
>> -boolactive;
>> +boolshared;
>> +boolactive; /* Only valid if shared */
>>  };
>>  
>>  struct irq_phys_map_entry {
>> @@ -356,7 +357,8 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 
>> reg);
>>  int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
>>  int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu);
>>  struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu,
>> -   int virt_irq, int irq);
>> +   int virt_irq, int irq,
>> +   bool shared);
>>  int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map 
>> *map);
>>  bool kvm_vgic_get_phys_irq_active(struct irq_phys_map *map);
>>  void kvm_vgic_set_phys_irq_active(struct irq_phys_map *map, bool active);
>> diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
>> index 76e38d2..db21d8f 100644
>> --- a/virt/kvm/arm/arch_timer.c
>> +++ b/virt/kvm/arm/arch_timer.c
>> @@ -203,7 +203,8 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
>>   * Tell the VGIC that the virtual interrupt is tied to a
>>   * physical interrupt. We do that once per VCPU.
>>   */
>> -map = kvm_vgic_map_phys_irq(vcpu, irq->irq, host_vtimer_irq);
>> +map = kvm_vgic_map_phys_irq(vcpu, irq->irq,
>> +host_vtimer_irq, true);
>>  if (WARN_ON(IS_ERR(map)))
>>  return PTR_ERR(map);
>>  
>> diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
>> index 9eb489a..fbd5ba5 100644
>> --- a/virt/kvm/arm/vgic.c
>> +++ b/virt/kvm/arm/vgic.c
>> @@ -400,7 +400,11 @@ void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq)
>>  
>>  static bool vgic_can_sample_irq(struct kvm_vcpu *vcpu, int irq)
>>  {
>> -return !vgic_irq_is_queued(vcpu, irq);
>> +struct irq_phys_map *map = vgic_irq_map_search(vcpu, irq);
>> +bool shared_hw = map && !map->shared;
> 
> why is shared true when map->shared is false?
definitively upside down
> 
>> +
>> +return !vgic_irq_is_queued(vcpu, irq) ||
>> +(shared_hw && vgic_dist_irq_is_pending(vcpu, irq));
> 
> so for forwarded, non-shared, level-triggered IRQs, we always sample the
> line if it's pending?  Why?

No we only sampled it if it was pending. The pending state was reset
when programming the LR.

Now that we model the queued state for mapped IRQ I will use that instead
> 
>>  }
>>  
>>  /**
>> @@ -1150,19 +1154,26 @@ static void vgic_queue_irq_to_lr(struct kvm_vcpu 
>> *vcpu, int irq,
>>   * active in the physical world. Otherwise the
>>   * physical interrupt will fire and the guest will
>>   * exit before processing the virtual interrupt.
>> + *
>> + * This is of course only valid for a shared
>> + * interrupt. A non shared interrupt should already be
>> + * active.
>>   */
>>  if (map) {
>> -int ret;
>> -
>> -BUG_ON(!map->active);
>>  vlr.hwirq = map->phys_irq;
>>  vlr.state |= LR_HW;
>>  vlr.state &= ~LR_

Re: [Qemu-devel] [PATCH 19/23] userfaultfd: activate syscall

2015-09-08 Thread Michael Ellerman
On Tue, 2015-09-08 at 17:14 +1000, Michael Ellerman wrote:
> On Tue, 2015-09-08 at 12:09 +0530, Bharata B Rao wrote:
> > On Tue, Sep 08, 2015 at 04:08:06PM +1000, Michael Ellerman wrote:
> > > Hmm, not for me. See below.
> > > 
> > > What setup were you testing on Bharata?
> > 
> > I was on commit a94572f5799dd of userfault21 branch in Andrea's tree
> > git://git.kernel.org/pub/scm/linux/kernel/git/andrea/aa.git
> > 
> > #uname -a
> > Linux 4.1.0-rc8+ #1 SMP Tue Aug 11 11:33:50 IST 2015 ppc64le ppc64le 
> > ppc64le GNU/Linux
> > 
> > In fact I had successfully done postcopy migration of sPAPR guest with
> > this setup.
> 
> OK, do you mind testing mainline with the same setup to see if the selftest
> passes.

Ah, I just tried it on big endian and it works. So it seems to not work on
little endian for some reason, /probably/ a test case bug?

cheers


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH 19/23] userfaultfd: activate syscall

2015-09-08 Thread Bharata B Rao
On Tue, Sep 08, 2015 at 09:59:47AM +0100, Dr. David Alan Gilbert wrote:
> * Bharata B Rao (bhar...@linux.vnet.ibm.com) wrote:
> > In fact I had successfully done postcopy migration of sPAPR guest with
> > this setup.
> 
> Interesting - I'd not got that far myself on power; I was hitting a problem
> loading htab ( htab_load() bad index 2113929216 (14848+0 entries) in htab 
> stream (htab_shift=25) )
> 
> Did you have to make any changes to the qemu code to get that happy?

I should have mentioned that I tried only QEMU driven migration within
the same host using wp3-postcopy branch of your tree. I don't see the
above issue.

(qemu) info migrate
capabilities: xbzrle: off rdma-pin-all: off auto-converge: off zero-blocks: off 
compress: off x-postcopy-ram: on 
Migration status: completed
total time: 39432 milliseconds
downtime: 162 milliseconds
setup: 14 milliseconds
transferred ram: 1297209 kbytes
throughput: 270.72 mbps
remaining ram: 0 kbytes
total ram: 4194560 kbytes
duplicate: 734015 pages
skipped: 0 pages
normal: 318469 pages
normal bytes: 1273876 kbytes
dirty sync count: 4

I will try migration between different hosts soon and check.

Regards,
Bharata.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-ppc] KVM memory slots limit on powerpc

2015-09-08 Thread Thomas Huth
On 08/09/15 09:11, Christian Borntraeger wrote:
> Am 08.09.2015 um 08:05 schrieb Thomas Huth:
>> On 07/09/15 16:31, Igor Mammedov wrote:
>>> On Fri, 4 Sep 2015 12:04:41 +0200
>>> Alexander Graf  wrote:
>>>> On 04.09.15 11:59, Christian Borntraeger wrote:
>>>>> Am 04.09.2015 um 11:35 schrieb Thomas Huth:
>>>>>>
>>>>>> now that we get memory hotplugging for the spapr machine on qemu-ppc,
>>>>>> too, it seems like we easily can hit the amount of KVM-internal memory
>>>>>> slots now ("#define KVM_USER_MEM_SLOTS 32" in
>>>>>> arch/powerpc/include/asm/kvm_host.h). For example, start
>>>>>> qemu-system-ppc64 with a couple of "-device secondary-vga" and "-m
>>>>>> 4G,slots=32,maxmem=40G" and then try to hot-plug all 32 DIMMs ... and
>>>>>> you'll see that it aborts way earlier already.
>>>>>>
>>>>>> The x86 code already increased the amount of KVM_USER_MEM_SLOTS to 509
>>>>>> already (+3 internal slots = 512) ... maybe we should now increase the
>>>>>> amount of slots on powerpc, too? Since we don't use internal slots on
>>>>>> POWER, would 512 be a good value? Or would less be sufficient, too?
>>>>>
>>>>> When you are at it, the s390 value should also be increased I guess.
>>>>
>>>> That constant defines the array size for the memslot array in struct kvm
>>>> which in turn again gets allocated by kzalloc, so it's pinned kernel
>>>> memory that is physically contiguous. Doing big allocations can turn
>>>> into problems during runtime.
>>>>
>>>> So maybe there is another way? Can we extend the memslot array size
>>>> dynamically somehow? Allocate it separately? How much memory does the
>>>> memslot array use up with 512 entries?
>>>
>>> KVM switched memslots allocation to kvm_kvzalloc(), so it would fallback to 
>>> vmalloc
>>>  commit 744961341d472db6272ed9b42319a90f5a2aa7c4
>>>  kvm: avoid page allocation failure in kvm_set_memory_region()
>>
>> Good hint, thanks for pointing that out! ... so increasing the array
>> size should not cause too much trouble :-)
> 
> Changing the allocation of the memslots to a growing structure seems like a
> good idea nevertheless. Any chance to do this as well when you are at it?

I'd like to finish some other stuff first, so if you want to have a try,
feel free to do so ... if not, I'll have a closer look at this later.

 Thomas


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH 19/23] userfaultfd: activate syscall

2015-09-08 Thread Dr. David Alan Gilbert
* Bharata B Rao (bhar...@linux.vnet.ibm.com) wrote:
> On Tue, Sep 08, 2015 at 04:08:06PM +1000, Michael Ellerman wrote:
> > On Wed, 2015-08-12 at 10:53 +0530, Bharata B Rao wrote:
> > > On Tue, Aug 11, 2015 at 03:48:26PM +0200, Andrea Arcangeli wrote:
> > > > Hello Bharata,
> > > > 
> > > > On Tue, Aug 11, 2015 at 03:37:29PM +0530, Bharata B Rao wrote:
> > > > > May be it is a bit late to bring this up, but I needed the following 
> > > > > fix
> > > > > to userfault21 branch of your git tree to compile on powerpc.
> > > > 
> > > > Not late, just in time. I increased the number of syscalls in earlier
> > > > versions, it must have gotten lost during a rejecting rebase, sorry.
> > > > 
> > > > I applied it to my tree and it can be applied to -mm and linux-next,
> > > > thanks!
> > > > 
> > > > The syscall for arm32 are also ready and on their way to the arm tree,
> > > > the testsuite worked fine there. ppc also should work fine if you
> > > > could confirm it'd be interesting, just beware that I got a typo in
> > > > the testcase:
> > > 
> > > The testsuite passes on powerpc.
> > > 
> > > 
> > > running userfaultfd
> > > 
> > > nr_pages: 2040, nr_pages_per_cpu: 170
> > > bounces: 31, mode: rnd racing ver poll, userfaults: 80 43 23 23 15 16 12 
> > > 1 2 96 13 128
> > > bounces: 30, mode: racing ver poll, userfaults: 35 54 62 49 47 48 2 8 0 
> > > 78 1 0
> > > bounces: 29, mode: rnd ver poll, userfaults: 114 153 70 106 78 57 143 92 
> > > 114 96 1 0
> > > bounces: 28, mode: ver poll, userfaults: 96 81 5 45 83 19 98 28 1 145 23 2
> > > bounces: 27, mode: rnd racing poll, userfaults: 54 65 60 54 45 49 1 2 1 2 
> > > 71 20
> > > bounces: 26, mode: racing poll, userfaults: 90 83 35 29 37 35 30 42 3 4 
> > > 49 6
> > > bounces: 25, mode: rnd poll, userfaults: 52 50 178 112 51 41 23 42 18 99 
> > > 59 0
> > > bounces: 24, mode: poll, userfaults: 136 101 83 260 84 29 16 88 1 6 160 57
> > > bounces: 23, mode: rnd racing ver, userfaults: 141 197 158 183 39 49 3 52 
> > > 8 3 6 0
> > > bounces: 22, mode: racing ver, userfaults: 242 266 244 180 162 32 87 43 
> > > 31 40 34 0
> > > > bounces: 21, mode: rnd ver, userfaults: 636 158 175 24 253 104 48 8 0 0 0 
> > > > 0
> > > bounces: 20, mode: ver, userfaults: 531 204 225 117 129 107 11 143 76 31 
> > > 1 0
> > > bounces: 19, mode: rnd racing, userfaults: 303 169 225 145 59 219 37 0 0 
> > > 0 0 0
> > > bounces: 18, mode: racing, userfaults: 374 372 37 144 126 90 25 12 15 17 
> > > 0 0
> > > bounces: 17, mode: rnd, userfaults: 313 412 134 108 80 99 7 56 85 0 0 0
> > > bounces: 16, mode:, userfaults: 431 58 87 167 120 113 98 60 14 8 48 0
> > > bounces: 15, mode: rnd racing ver poll, userfaults: 41 40 25 28 37 24 0 0 
> > > 0 0 180 75
> > > bounces: 14, mode: racing ver poll, userfaults: 43 53 30 28 25 15 19 0 0 
> > > 0 0 30
> > > bounces: 13, mode: rnd ver poll, userfaults: 136 91 114 91 92 79 114 77 
> > > 75 68 1 2
> > > bounces: 12, mode: ver poll, userfaults: 92 120 114 76 153 75 132 157 83 
> > > 81 10 1
> > > bounces: 11, mode: rnd racing poll, userfaults: 50 72 69 52 53 48 46 59 
> > > 57 51 37 1
> > > bounces: 10, mode: racing poll, userfaults: 33 49 38 68 35 63 57 49 49 47 
> > > 25 10
> > > bounces: 9, mode: rnd poll, userfaults: 167 150 67 123 39 75 1 2 9 125 1 1
> > > bounces: 8, mode: poll, userfaults: 147 102 20 87 5 27 118 14 104 40 21 28
> > > bounces: 7, mode: rnd racing ver, userfaults: 305 254 208 74 59 96 36 14 
> > > 11 7 4 5
> > > bounces: 6, mode: racing ver, userfaults: 290 114 191 94 162 114 34 6 6 
> > > 32 23 2
> > > bounces: 5, mode: rnd ver, userfaults: 370 381 22 273 21 106 17 55 0 0 0 0
> > > bounces: 4, mode: ver, userfaults: 328 279 179 191 74 86 95 15 13 10 0 0
> > > bounces: 3, mode: rnd racing, userfaults: 222 215 164 70 5 20 179 0 34 3 
> > > 0 0
> > > bounces: 2, mode: racing, userfaults: 316 385 112 160 225 5 30 49 42 2 4 0
> > > bounces: 1, mode: rnd, userfaults: 273 139 253 176 163 71 85 2 0 0 0 0
> > > bounces: 0, mode:, userfaults: 165 212 633 13 24 66 24 27 15 0 10 1
> > > [PASS]
> > 
> > Hmm, not for me. See below.
> > 
> > What setup were you testing on Bharata?
> 
> I was on commit a94572f5799dd of userfault21 branch in Andrea's tree
> git://git.kernel.org/pub/scm/linux/kernel/git/andrea/aa.git
> 
> #uname -a
> Linux 4.1.0-rc8+ #1 SMP Tue Aug 11 11:33:50 IST 2015 ppc64le ppc64le ppc64le 
> GNU/Linux
> 
> In fact I had successfully done postcopy migration of sPAPR guest with
> this setup.

Interesting - I'd not got that far myself on power; I was hitting a problem
loading htab ( htab_load() bad index 2113929216 (14848+0 entries) in htab 
stream (htab_shift=25) )

Did you have to make any changes to the qemu code to get that happy?

Dave

> > 
> > Mine is:
> > 
> > $ uname -a
> > Linux lebuntu 4.2.0-09705-g3a166acc1432 #2 SMP Tue Sep 8 15:18:00 AEST 2015 
> > ppc64le ppc64le ppc64le GNU/Linux
> > 
> > Which is 7d9071a09502 plus a couple of powerpc patches.
> > 
> > $ zgrep USERFAULTFD /proc/c

Re: [PATCH v2 6/8] arm/arm64: KVM: Add forwarded physical interrupts documentation

2015-09-08 Thread Eric Auger
Hi Andre,
On 09/07/2015 01:25 PM, Andre Przywara wrote:
> Hi,
> 
> firstly: this text is really great, thanks for coming up with that.
> See below for some information I got from tracing the host which I
> cannot make sense of
> 
> 
> On 04/09/15 20:40, Christoffer Dall wrote:
>> Forwarded physical interrupts on arm/arm64 is a tricky concept and the
>> way we deal with them is not apparently easy to understand by reading
>> various specs.
>>
>> Therefore, add a proper documentation file explaining the flow and
>> rationale of the behavior of the vgic.
>>
>> Some of this text was contributed by Marc Zyngier and edited by me.
>> Omissions and errors are all mine.
>>
>> Signed-off-by: Christoffer Dall 
>> ---
>>  Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt | 181 
>> +
>>  1 file changed, 181 insertions(+)
>>  create mode 100644 Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt
>>
>> diff --git a/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt 
>> b/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt
>> new file mode 100644
>> index 0000000..24b6f28
>> --- /dev/null
>> +++ b/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt
>> @@ -0,0 +1,181 @@
>> +KVM/ARM VGIC Forwarded Physical Interrupts
>> +==========================================
>> +
>> +The KVM/ARM code implements software support for the ARM Generic
>> +Interrupt Controller's (GIC's) hardware support for virtualization by
>> +allowing software to inject virtual interrupts to a VM, which the guest
>> +OS sees as regular interrupts.  The code is famously known as the VGIC.
>> +
>> +Some of these virtual interrupts, however, correspond to physical
>> +interrupts from real physical devices.  One example could be the
>> +architected timer, which itself supports virtualization, and therefore
>> +lets a guest OS program the hardware device directly to raise an
>> +interrupt at some point in time.  When such an interrupt is raised, the
>> +host OS initially handles the interrupt and must somehow signal this
>> +event as a virtual interrupt to the guest.  Another example could be a
>> +passthrough device, where the physical interrupts are initially handled
>> +by the host, but the device driver for the device lives in the guest OS
>> +and KVM must therefore somehow inject a virtual interrupt on behalf of
>> +the physical one to the guest OS.
>> +
>> +These virtual interrupts corresponding to a physical interrupt on the
>> +host are called forwarded physical interrupts, but are also sometimes
>> +referred to as 'virtualized physical interrupts' and 'mapped interrupts'.
>> +
>> +Forwarded physical interrupts are handled slightly differently compared
>> +to virtual interrupts generated purely by a software emulated device.
>> +
>> +
>> +The HW bit
>> +----------
>> +Virtual interrupts are signalled to the guest by programming the List
>> +Registers (LRs) on the GIC before running a VCPU.  The LR is programmed
>> +with the virtual IRQ number and the state of the interrupt (Pending,
>> +Active, or Pending+Active).  When the guest ACKs and EOIs a virtual
>> +interrupt, the LR state moves from Pending to Active, and finally to
>> +inactive.
>> +
>> +The LRs include an extra bit, called the HW bit.  When this bit is set,
>> +KVM must also program an additional field in the LR, the physical IRQ
>> +number, to link the virtual with the physical IRQ.
>> +
>> +When the HW bit is set, KVM must EITHER set the Pending OR the Active
>> +bit, never both at the same time.
>> +
>> +Setting the HW bit causes the hardware to deactivate the physical
>> +interrupt on the physical distributor when the guest deactivates the
>> +corresponding virtual interrupt.
>> +
>> +
>> +Forwarded Physical Interrupts Life Cycle
>> +----------------------------------------
>> +
>> +The state of forwarded physical interrupts is managed in the following way:
>> +
>> +  - The physical interrupt is acked by the host, and becomes active on
>> +the physical distributor (*).
>> +  - KVM sets the LR.Pending bit, because this is the only way the GICV
>> +interface is going to present it to the guest.
>> +  - LR.Pending will stay set as long as the guest has not acked the 
>> interrupt.
>> +  - LR.Pending transitions to LR.Active on the guest read of the IAR, as
>> +expected.
>> +  - On guest EOI, the *physical distributor* active bit gets cleared,
>> +but the LR.Active is left untouched (set).
> 
> I tried hard in the last week, but couldn't confirm this. Tracing shows
> the following pattern over and over (case 1):
> (This is the kvm/kvm.git:queue branch from last week, so including the
> mapped timer IRQ code. Tests were done on Juno and Midway)
> 
> ...
> 229.340171: kvm_exit: TRAP: HSR_EC: 0x0001 (WFx), PC: 0xffc98a64
> 229.340324: kvm_exit: IRQ: HSR_EC: 0x0001 (WFx), PC: 0xffc0001c63a0
> 229.340428: kvm_exit: TRAP: HSR_EC: 0x0024 (DABT_LOW), PC:
> 0xffc0004089d8
> 229.340430: kvm_vgic_sync_hwstate: LR0 vIRQ: 27, HWIRQ: 27, LR.state: 8,

Re: [PATCH v2 6/8] arm/arm64: KVM: Add forwarded physical interrupts documentation

2015-09-08 Thread Eric Auger
Hi Marc,
On 09/07/2015 07:50 PM, Marc Zyngier wrote:
> On 07/09/15 17:45, Eric Auger wrote:
>> Hi Christoffer,
>> On 09/04/2015 09:40 PM, Christoffer Dall wrote:
>>> Forwarded physical interrupts on arm/arm64 is a tricky concept and the
>>> way we deal with them is not apparently easy to understand by reading
>>> various specs.
>>>
>>> Therefore, add a proper documentation file explaining the flow and
>>> rationale of the behavior of the vgic.
>>>
>>> Some of this text was contributed by Marc Zyngier and edited by me.
>>> Omissions and errors are all mine.
>>>
>>> Signed-off-by: Christoffer Dall 
>>> ---
>>>  Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt | 181 
>>> +
>>>  1 file changed, 181 insertions(+)
>>>  create mode 100644 Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt
>>>
>>> diff --git a/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt 
>>> b/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt
>>> new file mode 100644
>>> index 0000000..24b6f28
>>> --- /dev/null
>>> +++ b/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt
>>> @@ -0,0 +1,181 @@
>>> +KVM/ARM VGIC Forwarded Physical Interrupts
>>> +==========================================
> 
> [...]
> 
>>> +1.  KVM runs the VCPU
>>> +2.  The guest programs the time to fire in T+100
>>> +4.  At T+100 the timer fires and a physical IRQ causes the VM to exit
>>> +5.  With interrupts disabled on the CPU, KVM looks at the timer state
>>> +and injects a forwarded physical interrupt because it concludes the
>>> +timer has expired.
>> I don't get how we can trap without the virtual timer PPI handler being
>> entered on host side. Please can you elaborate on this?
> 
> On VM exit, we disable the virtual timer (see the code in
> hyp.S::save_timer_state where we clear the enable bit). We still perform
> the exit, but the cause for exit is now gone, and the handler will never
> fire.
OK thanks for the clarification

Eric
> 
> Thanks,
> 
>   M.
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH 19/23] userfaultfd: activate syscall

2015-09-08 Thread Michael Ellerman
On Tue, 2015-09-08 at 12:09 +0530, Bharata B Rao wrote:
> On Tue, Sep 08, 2015 at 04:08:06PM +1000, Michael Ellerman wrote:
> > On Wed, 2015-08-12 at 10:53 +0530, Bharata B Rao wrote:
> > > On Tue, Aug 11, 2015 at 03:48:26PM +0200, Andrea Arcangeli wrote:
> > > > Hello Bharata,
> > > > 
> > > > On Tue, Aug 11, 2015 at 03:37:29PM +0530, Bharata B Rao wrote:
> > > > > May be it is a bit late to bring this up, but I needed the following 
> > > > > fix
> > > > > to userfault21 branch of your git tree to compile on powerpc.
> > > > 
> > > > Not late, just in time. I increased the number of syscalls in earlier
> > > > versions, it must have gotten lost during a rejecting rebase, sorry.
> > > > 
> > > > I applied it to my tree and it can be applied to -mm and linux-next,
> > > > thanks!
> > > > 
> > > > The syscall for arm32 are also ready and on their way to the arm tree,
> > > > the testsuite worked fine there. ppc also should work fine if you
> > > > could confirm it'd be interesting, just beware that I got a typo in
> > > > the testcase:
> > > 
> > > The testsuite passes on powerpc.
> > > 
> > > 
> > > running userfaultfd
> > > 
> > > nr_pages: 2040, nr_pages_per_cpu: 170
> > > bounces: 31, mode: rnd racing ver poll, userfaults: 80 43 23 23 15 16 12 
> > > 1 2 96 13 128
> > > bounces: 30, mode: racing ver poll, userfaults: 35 54 62 49 47 48 2 8 0 
> > > 78 1 0
> > > bounces: 29, mode: rnd ver poll, userfaults: 114 153 70 106 78 57 143 92 
> > > 114 96 1 0
> > > bounces: 28, mode: ver poll, userfaults: 96 81 5 45 83 19 98 28 1 145 23 2
> > > bounces: 27, mode: rnd racing poll, userfaults: 54 65 60 54 45 49 1 2 1 2 
> > > 71 20
> > > bounces: 26, mode: racing poll, userfaults: 90 83 35 29 37 35 30 42 3 4 
> > > 49 6
> > > bounces: 25, mode: rnd poll, userfaults: 52 50 178 112 51 41 23 42 18 99 
> > > 59 0
> > > bounces: 24, mode: poll, userfaults: 136 101 83 260 84 29 16 88 1 6 160 57
> > > bounces: 23, mode: rnd racing ver, userfaults: 141 197 158 183 39 49 3 52 
> > > 8 3 6 0
> > > bounces: 22, mode: racing ver, userfaults: 242 266 244 180 162 32 87 43 
> > > 31 40 34 0
> > > > bounces: 21, mode: rnd ver, userfaults: 636 158 175 24 253 104 48 8 0 0 0 
> > > > 0
> > > bounces: 20, mode: ver, userfaults: 531 204 225 117 129 107 11 143 76 31 
> > > 1 0
> > > bounces: 19, mode: rnd racing, userfaults: 303 169 225 145 59 219 37 0 0 
> > > 0 0 0
> > > bounces: 18, mode: racing, userfaults: 374 372 37 144 126 90 25 12 15 17 
> > > 0 0
> > > bounces: 17, mode: rnd, userfaults: 313 412 134 108 80 99 7 56 85 0 0 0
> > > bounces: 16, mode:, userfaults: 431 58 87 167 120 113 98 60 14 8 48 0
> > > bounces: 15, mode: rnd racing ver poll, userfaults: 41 40 25 28 37 24 0 0 
> > > 0 0 180 75
> > > bounces: 14, mode: racing ver poll, userfaults: 43 53 30 28 25 15 19 0 0 
> > > 0 0 30
> > > bounces: 13, mode: rnd ver poll, userfaults: 136 91 114 91 92 79 114 77 
> > > 75 68 1 2
> > > bounces: 12, mode: ver poll, userfaults: 92 120 114 76 153 75 132 157 83 
> > > 81 10 1
> > > bounces: 11, mode: rnd racing poll, userfaults: 50 72 69 52 53 48 46 59 
> > > 57 51 37 1
> > > bounces: 10, mode: racing poll, userfaults: 33 49 38 68 35 63 57 49 49 47 
> > > 25 10
> > > bounces: 9, mode: rnd poll, userfaults: 167 150 67 123 39 75 1 2 9 125 1 1
> > > bounces: 8, mode: poll, userfaults: 147 102 20 87 5 27 118 14 104 40 21 28
> > > bounces: 7, mode: rnd racing ver, userfaults: 305 254 208 74 59 96 36 14 
> > > 11 7 4 5
> > > bounces: 6, mode: racing ver, userfaults: 290 114 191 94 162 114 34 6 6 
> > > 32 23 2
> > > bounces: 5, mode: rnd ver, userfaults: 370 381 22 273 21 106 17 55 0 0 0 0
> > > bounces: 4, mode: ver, userfaults: 328 279 179 191 74 86 95 15 13 10 0 0
> > > bounces: 3, mode: rnd racing, userfaults: 222 215 164 70 5 20 179 0 34 3 
> > > 0 0
> > > bounces: 2, mode: racing, userfaults: 316 385 112 160 225 5 30 49 42 2 4 0
> > > bounces: 1, mode: rnd, userfaults: 273 139 253 176 163 71 85 2 0 0 0 0
> > > bounces: 0, mode:, userfaults: 165 212 633 13 24 66 24 27 15 0 10 1
> > > [PASS]
> > 
> > Hmm, not for me. See below.
> > 
> > What setup were you testing on Bharata?
> 
> I was on commit a94572f5799dd of userfault21 branch in Andrea's tree
> git://git.kernel.org/pub/scm/linux/kernel/git/andrea/aa.git
> 
> #uname -a
> Linux 4.1.0-rc8+ #1 SMP Tue Aug 11 11:33:50 IST 2015 ppc64le ppc64le ppc64le 
> GNU/Linux
> 
> In fact I had successfully done postcopy migration of sPAPR guest with
> this setup.

OK, do you mind testing mainline with the same setup to see if the selftest
passes.

cheers



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-ppc] KVM memory slots limit on powerpc

2015-09-08 Thread Christian Borntraeger
Am 08.09.2015 um 08:05 schrieb Thomas Huth:
> On 07/09/15 16:31, Igor Mammedov wrote:
>> On Fri, 4 Sep 2015 12:04:41 +0200
>> Alexander Graf  wrote:
>>
>>>
>>>
>>> On 04.09.15 11:59, Christian Borntraeger wrote:
>>>> Am 04.09.2015 um 11:35 schrieb Thomas Huth:
>>>>>
>>>>>  Hi all,
>>>>>
>>>>> now that we get memory hotplugging for the spapr machine on qemu-ppc,
>>>>> too, it seems like we easily can hit the amount of KVM-internal memory
>>>>> slots now ("#define KVM_USER_MEM_SLOTS 32" in
>>>>> arch/powerpc/include/asm/kvm_host.h). For example, start
>>>>> qemu-system-ppc64 with a couple of "-device secondary-vga" and "-m
>>>>> 4G,slots=32,maxmem=40G" and then try to hot-plug all 32 DIMMs ... and
>>>>> you'll see that it aborts way earlier already.
>>>>>
>>>>> The x86 code already increased the amount of KVM_USER_MEM_SLOTS to 509
>>>>> already (+3 internal slots = 512) ... maybe we should now increase the
>>>>> amount of slots on powerpc, too? Since we don't use internal slots on
>>>>> POWER, would 512 be a good value? Or would less be sufficient, too?
>>>>
>>>> When you are at it, the s390 value should also be increased I guess.
>>>
>>> That constant defines the array size for the memslot array in struct kvm
>>> which in turn again gets allocated by kzalloc, so it's pinned kernel
>>> memory that is physically contiguous. Doing big allocations can turn
>>> into problems during runtime.
>>>
>>> So maybe there is another way? Can we extend the memslot array size
>>> dynamically somehow? Allocate it separately? How much memory does the
>>> memslot array use up with 512 entries?
>>
>> KVM switched memslots allocation to kvm_kvzalloc(), so it would fallback to 
>> vmalloc
>>  commit 744961341d472db6272ed9b42319a90f5a2aa7c4
>>  kvm: avoid page allocation failure in kvm_set_memory_region()
> 
> Good hint, thanks for pointing that out! ... so increasing the array
> size should not cause too much trouble :-)

Changing the allocation of the memslots to a growing structure seems like a
good idea nevertheless. Any chance to do this as well when you are at it?

Christian

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html