[PATCH] ARM/ARM64: KVM: remove 'config KVM_ARM_MAX_VCPUS'

2015-09-01 Thread Ming Lei
This patch removes the KVM_ARM_MAX_VCPUS config option and, like other
architectures, simply uses the maximum value allowed by the hardware,
for the following reasons:

1) from a distribution point of view, the option has to be set to the
maximum allowed value anyway, because it needs to cover all kinds of
virtualization applications and support most SoCs;

2) using a bigger value doesn't introduce extra memory consumption, and
the help text in Kconfig isn't accurate because the kvm_vcpu structure
isn't allocated until QEMU sends the request to create a VCPU;

3) the main effect is that the vcpus[] field in 'struct kvm' becomes a
bit bigger (sizeof(void *) per vcpu) and needs more cache lines to hold
the structure, but 'struct kvm' is a generic struct and other
architectures already work well this way. Also, the world-switch
frequency is often low, for example ~2000 when running a kernel-build
load in a VM on an APM xgene KVM host, so the effect is very small and
no difference could be observed in my tests at all.
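
As a rough sanity check of point 3 (a back-of-the-envelope sketch, not part
of the patch), the extra per-VM cost on a 64-bit host is about 2KB of
pointers, regardless of how many VCPUs are actually created:

	#include <stdio.h>

	int main(void)
	{
		/* 4 was the old Kconfig default, 255 is VGIC_V3_MAX_CPUS below */
		unsigned long old_max = 4, new_max = 255;

		printf("extra vcpus[] bytes per VM: %lu\n",
		       (new_max - old_max) * sizeof(void *)); /* 251 * 8 = 2008 */
		return 0;
	}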

Cc: Dann Frazier 
Cc: Christoffer Dall 
Cc: Marc Zyngier 
Cc: kvm...@lists.cs.columbia.edu
Cc: kvm@vger.kernel.org
Signed-off-by: Ming Lei 
---
 arch/arm/include/asm/kvm_host.h   |  8 ++--
 arch/arm/kvm/Kconfig  | 11 ---
 arch/arm64/include/asm/kvm_host.h |  8 ++--
 arch/arm64/kvm/Kconfig| 11 ---
 include/kvm/arm_vgic.h|  6 +-
 virt/kvm/arm/vgic-v3.c|  2 +-
 6 files changed, 6 insertions(+), 40 deletions(-)

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index dcba0fa..c8c226a 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -29,12 +29,6 @@
 
 #define __KVM_HAVE_ARCH_INTC_INITIALIZED
 
-#if defined(CONFIG_KVM_ARM_MAX_VCPUS)
-#define KVM_MAX_VCPUS CONFIG_KVM_ARM_MAX_VCPUS
-#else
-#define KVM_MAX_VCPUS 0
-#endif
-
 #define KVM_USER_MEM_SLOTS 32
 #define KVM_PRIVATE_MEM_SLOTS 4
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
@@ -44,6 +38,8 @@
 
 #include 
 
+#define KVM_MAX_VCPUS VGIC_V2_MAX_CPUS
+
 u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode);
 int __attribute_const__ kvm_target_cpu(void);
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
index bfb915d..210ecca 100644
--- a/arch/arm/kvm/Kconfig
+++ b/arch/arm/kvm/Kconfig
@@ -45,15 +45,4 @@ config KVM_ARM_HOST
---help---
  Provides host support for ARM processors.
 
-config KVM_ARM_MAX_VCPUS
-   int "Number maximum supported virtual CPUs per VM"
-   depends on KVM_ARM_HOST
-   default 4
-   help
- Static number of max supported virtual CPUs per VM.
-
- If you choose a high number, the vcpu structures will be quite
- large, so only choose a reasonable number that you expect to
- actually use.
-
 endif # VIRTUALIZATION
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 415938d..3fb58ea 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -30,12 +30,6 @@
 
 #define __KVM_HAVE_ARCH_INTC_INITIALIZED
 
-#if defined(CONFIG_KVM_ARM_MAX_VCPUS)
-#define KVM_MAX_VCPUS CONFIG_KVM_ARM_MAX_VCPUS
-#else
-#define KVM_MAX_VCPUS 0
-#endif
-
 #define KVM_USER_MEM_SLOTS 32
 #define KVM_PRIVATE_MEM_SLOTS 4
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
@@ -43,6 +37,8 @@
 #include 
 #include 
 
+#define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS
+
 #define KVM_VCPU_MAX_FEATURES 3
 
 int __attribute_const__ kvm_target_cpu(void);
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index bfffe8f..5c7e920 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -41,15 +41,4 @@ config KVM_ARM_HOST
---help---
  Provides host support for ARM processors.
 
-config KVM_ARM_MAX_VCPUS
-   int "Number maximum supported virtual CPUs per VM"
-   depends on KVM_ARM_HOST
-   default 4
-   help
- Static number of max supported virtual CPUs per VM.
-
- If you choose a high number, the vcpu structures will be quite
- large, so only choose a reasonable number that you expect to
- actually use.
-
 endif # VIRTUALIZATION
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index d901f1a..4e14dac 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -35,11 +35,7 @@
 #define VGIC_V3_MAX_LRS		16
 #define VGIC_MAX_IRQS  1024
 #define VGIC_V2_MAX_CPUS   8
-
-/* Sanity checks... */
-#if (KVM_MAX_VCPUS > 255)
-#error Too many KVM VCPUs, the VGIC only supports up to 255 VCPUs for now
-#endif
+#define VGIC_V3_MAX_CPUS   255
 
 #if (VGIC_NR_IRQS_LEGACY & 31)
 #error "VGIC_NR_IRQS must be a multiple of 32"
diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c
index afbf925..7dd5d62 100644
--- a/virt/kvm/arm/vgic-v3.c
+++ b/virt/kvm/arm/vgic-v3.c
@@ -288,7 +288,7 @@ int vgic_v3_probe(struct device_node *vgic_node,
 
vgic->vctrl_base = NULL;

[PATCH v5 0/3] KVM: Dynamic Halt-Polling

2015-09-01 Thread Wanpeng Li
v4 -> v5:
 * set base case 10us and max poll time 500us
 * handle short/long halt, idea from David, many thanks David ;-)

v3 -> v4:
 * bring back grow vcpu->halt_poll_ns when interrupt arrives and shrinks
   when idle VCPU is detected 

v2 -> v3:
 * grow/shrink vcpu->halt_poll_ns by *halt_poll_ns_grow or /halt_poll_ns_shrink
 * drop the macros and hard coding the numbers in the param definitions
 * update the comments "5-7 us"
 * remove halt_poll_ns_max and use halt_poll_ns as the max halt_poll_ns time,
   vcpu->halt_poll_ns start at zero
 * drop the wrappers 
 * move the grow/shrink logic before "out:" w/ "if (waited)"

v1 -> v2:
 * change kvm_vcpu_block to read halt_poll_ns from the vcpu instead of 
   the module parameter
 * use the shrink/grow matrix which is suggested by David
 * set halt_poll_ns_max to 2ms

There is a downside to always-poll: polling still happens for idle
vCPUs, which can waste CPU time. This patchset adds the ability to adjust
halt_poll_ns dynamically, growing halt_poll_ns when a short halt is detected
and shrinking it when a long halt is detected.

There are two new kernel parameters for changing the halt_poll_ns:
halt_poll_ns_grow and halt_poll_ns_shrink. 

                         no-poll      always-poll    dynamic-poll
------------------------------------------------------------------
Idle (nohz) vCPU %c0     0.15%        0.3%           0.2%
Idle (250HZ) vCPU %c0    1.1%         4.6%~14%       1.2%
TCP_RR latency           34us         27us           26.7us

"Idle (X) vCPU %c0" is the percent of time the physical cpu spent in
c0 over 60 seconds (each vCPU is pinned to a pCPU). (nohz) means the
guest was tickless. (250HZ) means the guest was ticking at 250HZ.

The big win is with ticking operating systems. Running the linux guest
with nohz=off (and HZ=250), we save 3.4%~12.8% CPUs/second and get close 
to no-polling overhead levels by using the dynamic-poll. The savings
should be even higher for higher frequency ticks.
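
For reference, the adjustment itself (added by patch 2/3 below) boils down to
the following check at the end of kvm_vcpu_block(), where block_ns is the
measured length of the whole halt (poll time plus wait time):

	if (block_ns <= vcpu->halt_poll_ns)
		;
	/* we had a long block, shrink polling */
	else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
		shrink_halt_poll_ns(vcpu);
	/* we had a short halt and our poll time is too small */
	else if (vcpu->halt_poll_ns < halt_poll_ns && block_ns < halt_poll_ns)
		grow_halt_poll_ns(vcpu);

Both thresholds come from the halt_poll_ns module parameter; the per-vCPU
value starts at zero and only grows once short halts are observed.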


Wanpeng Li (3):
  KVM: make halt_poll_ns per-VCPU
  KVM: dynamic halt_poll_ns adjustment
  KVM: trace kvm_halt_poll_ns grow/shrink

 include/linux/kvm_host.h   |  1 +
 include/trace/events/kvm.h | 30 
 virt/kvm/kvm_main.c| 69 ++
 3 files changed, 95 insertions(+), 5 deletions(-)

-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 2/3] KVM: dynamic halt_poll_ns adjustment

2015-09-01 Thread Wanpeng Li
There is a downside to always-poll: polling still happens for idle
vCPUs, which can waste CPU time. This patch adds the ability to adjust
halt_poll_ns dynamically, growing halt_poll_ns when a short halt is detected
and shrinking it when a long halt is detected.

There are two new kernel parameters for changing the halt_poll_ns:
halt_poll_ns_grow and halt_poll_ns_shrink. 

                         no-poll      always-poll    dynamic-poll
------------------------------------------------------------------
Idle (nohz) vCPU %c0     0.15%        0.3%           0.2%
Idle (250HZ) vCPU %c0    1.1%         4.6%~14%       1.2%
TCP_RR latency           34us         27us           26.7us

"Idle (X) vCPU %c0" is the percent of time the physical cpu spent in
c0 over 60 seconds (each vCPU is pinned to a pCPU). (nohz) means the
guest was tickless. (250HZ) means the guest was ticking at 250HZ.

The big win is with ticking operating systems. Running the linux guest
with nohz=off (and HZ=250), we save 3.4%~12.8% CPUs/second and get close 
to no-polling overhead levels by using the dynamic-poll. The savings
should be even higher for higher frequency ticks.

Suggested-by: David Matlack 
Signed-off-by: Wanpeng Li 
---
 virt/kvm/kvm_main.c | 60 ++---
 1 file changed, 57 insertions(+), 3 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c06e57c..2206cb0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -66,9 +66,18 @@
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
-static unsigned int halt_poll_ns;
+/* halt polling only reduces halt latency by 5-7 us, 500us is enough */
+static unsigned int halt_poll_ns = 500000;
 module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR);
 
+/* Default doubles per-vcpu halt_poll_ns. */
+static unsigned int halt_poll_ns_grow = 2;
+module_param(halt_poll_ns_grow, int, S_IRUGO);
+
+/* Default resets per-vcpu halt_poll_ns . */
+static unsigned int halt_poll_ns_shrink;
+module_param(halt_poll_ns_shrink, int, S_IRUGO);
+
 /*
  * Ordering of locks:
  *
@@ -1907,6 +1916,31 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
 
+static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
+{
+   int val = vcpu->halt_poll_ns;
+
+   /* 10us base */
+   if (val == 0 && halt_poll_ns_grow)
+   val = 10000;
+   else
+   val *= halt_poll_ns_grow;
+
+   vcpu->halt_poll_ns = val;
+}
+
+static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
+{
+   int val = vcpu->halt_poll_ns;
+
+   if (halt_poll_ns_shrink == 0)
+   val = 0;
+   else
+   val /= halt_poll_ns_shrink;
+
+   vcpu->halt_poll_ns = val;
+}
+
 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
 {
if (kvm_arch_vcpu_runnable(vcpu)) {
@@ -1929,6 +1963,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
ktime_t start, cur;
DEFINE_WAIT(wait);
bool waited = false;
+   u64 poll_ns = 0, wait_ns = 0, block_ns = 0;
 
start = cur = ktime_get();
if (vcpu->halt_poll_ns) {
@@ -1941,12 +1976,17 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 */
if (kvm_vcpu_check_block(vcpu) < 0) {
++vcpu->stat.halt_successful_poll;
-   goto out;
+   break;
}
cur = ktime_get();
} while (single_task_running() && ktime_before(cur, stop));
}
 
+   if (ktime_after(cur, start)) {
+   poll_ns = ktime_to_ns(cur) - ktime_to_ns(start);
+   goto out;
+   }
+
for (;;) {
prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
 
@@ -1959,9 +1999,23 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 
finish_wait(&vcpu->wq, &wait);
cur = ktime_get();
+   wait_ns = ktime_to_ns(cur) - ktime_to_ns(start);
 
 out:
-   trace_kvm_vcpu_wakeup(ktime_to_ns(cur) - ktime_to_ns(start), waited);
+   block_ns = poll_ns + wait_ns;
+
+   if (halt_poll_ns) {
+   if (block_ns <= vcpu->halt_poll_ns)
+   ;
+   /* we had a long block, shrink polling */
+   else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
+   shrink_halt_poll_ns(vcpu);
+   /* we had a short halt and our poll time is too small */
+   else if (vcpu->halt_poll_ns < halt_poll_ns && block_ns < halt_poll_ns)
+   grow_halt_poll_ns(vcpu);
+   }
+
+   trace_kvm_vcpu_wakeup(block_ns, waited);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_block);
 
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v5 3/3] KVM: trace kvm_halt_poll_ns grow/shrink

2015-09-01 Thread Wanpeng Li
Tracepoint for dynamic halt_poll_ns, fired on every potential change.

Signed-off-by: Wanpeng Li 
---
 include/trace/events/kvm.h | 30 ++
 virt/kvm/kvm_main.c|  8 ++--
 2 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index a44062d..75ddf80 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -356,6 +356,36 @@ TRACE_EVENT(
  __entry->address)
 );
 
+TRACE_EVENT(kvm_halt_poll_ns,
+   TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old),
+   TP_ARGS(grow, vcpu_id, new, old),
+
+   TP_STRUCT__entry(
+   __field(bool, grow)
+   __field(unsigned int, vcpu_id)
+   __field(int, new)
+   __field(int, old)
+   ),
+
+   TP_fast_assign(
+   __entry->grow   = grow;
+   __entry->vcpu_id= vcpu_id;
+   __entry->new= new;
+   __entry->old= old;
+   ),
+
+   TP_printk("vcpu %u: halt_pool_ns %d (%s %d)",
+   __entry->vcpu_id,
+   __entry->new,
+   __entry->grow ? "grow" : "shrink",
+   __entry->old)
+);
+
+#define trace_kvm_halt_poll_ns_grow(vcpu_id, new, old) \
+   trace_kvm_halt_poll_ns(true, vcpu_id, new, old)
+#define trace_kvm_halt_poll_ns_shrink(vcpu_id, new, old) \
+   trace_kvm_halt_poll_ns(false, vcpu_id, new, old)
+
 #endif
 
 #endif /* _TRACE_KVM_MAIN_H */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2206cb0..9d28232 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1918,8 +1918,9 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
 
 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
 {
-   int val = vcpu->halt_poll_ns;
+   int old, val;
 
+   old = val = vcpu->halt_poll_ns;
/* 10us base */
if (val == 0 && halt_poll_ns_grow)
val = 10000;
@@ -1927,18 +1928,21 @@ static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
val *= halt_poll_ns_grow;
 
vcpu->halt_poll_ns = val;
+   trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
 }
 
 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
 {
-   int val = vcpu->halt_poll_ns;
+   int old, val;
 
+   old = val = vcpu->halt_poll_ns;
if (halt_poll_ns_shrink == 0)
val = 0;
else
val /= halt_poll_ns_shrink;
 
vcpu->halt_poll_ns = val;
+   trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
 }
 
 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 1/3] KVM: make halt_poll_ns per-VCPU

2015-09-01 Thread Wanpeng Li
Change halt_poll_ns into a per-vCPU variable, seeded from the module
parameter, to allow greater flexibility.

Signed-off-by: Wanpeng Li 
---
 include/linux/kvm_host.h | 1 +
 virt/kvm/kvm_main.c  | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 81089cf..1bef9e2 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -242,6 +242,7 @@ struct kvm_vcpu {
int sigset_active;
sigset_t sigset;
struct kvm_vcpu_stat stat;
+   unsigned int halt_poll_ns;
 
 #ifdef CONFIG_HAS_IOMEM
int mmio_needed;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d8db2f8f..c06e57c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -217,6 +217,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
vcpu->kvm = kvm;
vcpu->vcpu_id = id;
vcpu->pid = NULL;
+   vcpu->halt_poll_ns = 0;
init_waitqueue_head(&vcpu->wq);
kvm_async_pf_vcpu_init(vcpu);
 
@@ -1930,8 +1931,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
bool waited = false;
 
start = cur = ktime_get();
-   if (halt_poll_ns) {
-   ktime_t stop = ktime_add_ns(ktime_get(), halt_poll_ns);
+   if (vcpu->halt_poll_ns) {
+   ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
 
do {
/*
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 0/3] KVM: Dynamic Halt-Polling

2015-09-01 Thread Wanpeng Li

On 9/2/15 9:49 AM, David Matlack wrote:

On Tue, Sep 1, 2015 at 5:29 PM, Wanpeng Li  wrote:

On 9/2/15 7:24 AM, David Matlack wrote:

On Tue, Sep 1, 2015 at 3:58 PM, Wanpeng Li  wrote:



Why this can happen?

Ah, probably because I'm missing 9c8fd1ba220 (KVM: x86: optimize delivery
of TSC deadline timer interrupt). I don't think the edge case exists in
the latest kernel.


Yeah, I hope we both (including Peter Kieser) can test against the latest kvm
tree to avoid confusion. The reason to introduce the adaptive halt-polling
toggle is to handle the "edge case" you mentioned above. So I think we can
put more effort into improving v4 instead. I will improve v4 to handle short
halts today. ;-)

That's fine. It's just easier to convey my ideas with a patch. FYI the
other reason for the toggle patch was to add the timer for kvm_vcpu_block,
which I think is the only way to get dynamic halt-polling right. Feel free
to work on top of v4!


I introduce your idea to shrink/grow poll time in v5 by detecting 
long/short halt and the performance looks good. Many thanks your help, 
David! ;-)


Regards,
Wanpeng Li





Did you test your patch against a windows guest?

I have not. I tested against a 250HZ linux guest to check how it performs
against a ticking guest. Presumably, windows should be the same, but at a
higher tick rate. Do you have a test for Windows?


I just test the idle vCPUs usage.


V4 for windows 10:

+------------------+------------------+------------------------+
|                  |                  |                        |
|  w/o halt-poll   |   w/ halt-poll   |  dynamic(v4) halt-poll |
+------------------+------------------+------------------------+
|                  |                  |                        |
|      ~2.1%       |      ~3.0%       |         ~2.4%          |
+------------------+------------------+------------------------+

I'm not seeing the same results with v4. With a 250HZ ticking guest
I see 15% c0 with halt_poll_ns=200 and 1.27% with halt_poll_ns=0.
Are you running one vcpu per pcpu?

(The reason for the overhead: the new tracepoint shows each vcpu is
alternating between 0 and 500 us.)


V4  for linux guest:

+-++---+
| ||   |
|  w/o halt-poll  |  w/ halt-poll  | dynamic halt-poll |
+-++---+
| ||   |
|~0.9%|~1.8%   | ~1.2% |
+-++---+


Regards,
Wanpeng Li


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: commit 3c2e7f7de3 (KVM use NPT page attributes) causes boot failures

2015-09-01 Thread Markus Trippelsdorf
On 2015.09.02 at 06:31 +0800, Xiao Guangrong wrote:
> 
> 
> On 09/01/2015 09:56 PM, Markus Trippelsdorf wrote:
> > On 2015.09.01 at 21:00 +0800, Xiao Guangrong wrote:
> >>
> >> Did it trigger the BUG()/BUG_ON() in mtrr2protval()/fallback_mtrr_type()?
> >> If yes, could you please print the actual value out?
> >
> > It is the BUG() in fallback_mtrr_type(). I changed it to a printk and
> > it prints 1 for the value of mtrr.
> >
> >   MTRR_TYPE_WRCOMB 1
> >
> 
> Then I suspect pat is not enabled in your box, could you please check
> CONFIG_X86_PAT is selected in your .config file, pat is shown in
> /proc/cpuinfo, "nopat" kernel parameter is used, and dmesg | grep PAT.

No. PAT is of course enabled and booting sometimes succeeds even
with the BUG() in fallback_mtrr_type(). I suspect a setup (timing) issue.

markus@x4 linux % cat .config | grep  X86_PAT
CONFIG_X86_PAT=y
markus@x4 linux % dmesg | grep PAT
[0.00] x86/PAT: Configuration [0-7]: WB  WC  UC- UC  WB  WC  UC- WT  
markus@x4 linux % cat /proc/cpuinfo| grep pat
flags   : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov 
pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb 
rdtscp lm 3dnowext 3dnow constant_tsc rep_good nopl nonstop_tsc extd_apicid pni 
monitor cx16 popcnt lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a 
misalignsse 3dnowprefetch osvw ibs skinit wdt hw_pstate npt lbrv svm_lock 
nrip_save vmmcall
...

-- 
Markus
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 0/3] KVM: Dynamic Halt-Polling

2015-09-01 Thread David Matlack
On Tue, Sep 1, 2015 at 5:29 PM, Wanpeng Li  wrote:
> On 9/2/15 7:24 AM, David Matlack wrote:
>>
>> On Tue, Sep 1, 2015 at 3:58 PM, Wanpeng Li  wrote:

>>>
>>> Why this can happen?
>>
>> Ah, probably because I'm missing 9c8fd1ba220 (KVM: x86: optimize delivery
>> of TSC deadline timer interrupt). I don't think the edge case exists in
>> the latest kernel.
>
>
> Yeah, I hope we both (including Peter Kieser) can test against the latest kvm
> tree to avoid confusion. The reason to introduce the adaptive halt-polling
> toggle is to handle the "edge case" you mentioned above. So I think we can
> put more effort into improving v4 instead. I will improve v4 to handle short
> halts today. ;-)

That's fine. It's just easier to convey my ideas with a patch. FYI the
other reason for the toggle patch was to add the timer for kvm_vcpu_block,
which I think is the only way to get dynamic halt-polling right. Feel free
to work on top of v4!

>

>>>
>>> Did you test your patch against a windows guest?
>>
>> I have not. I tested against a 250HZ linux guest to check how it performs
>> against a ticking guest. Presumably, windows should be the same, but at a
>> higher tick rate. Do you have a test for Windows?
>
>
> I just test the idle vCPUs usage.
>
>
> V4 for windows 10:
>
> +------------------+------------------+------------------------+
> |                  |                  |                        |
> |  w/o halt-poll   |   w/ halt-poll   |  dynamic(v4) halt-poll |
> +------------------+------------------+------------------------+
> |                  |                  |                        |
> |      ~2.1%       |      ~3.0%       |         ~2.4%          |
> +------------------+------------------+------------------------+

I'm not seeing the same results with v4. With a 250HZ ticking guest
I see 15% c0 with halt_poll_ns=200 and 1.27% with halt_poll_ns=0.
Are you running one vcpu per pcpu?

(The reason for the overhead: the new tracepoint shows each vcpu is
alternating between 0 and 500 us.)

>
> V4  for linux guest:
>
> +-++---+
> | ||   |
> |  w/o halt-poll  |  w/ halt-poll  | dynamic halt-poll |
> +-++---+
> | ||   |
> |~0.9%|~1.8%   | ~1.2% |
> +-++---+
>
>
> Regards,
> Wanpeng Li
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 0/3] KVM: Dynamic Halt-Polling

2015-09-01 Thread Wanpeng Li

On 9/2/15 7:24 AM, David Matlack wrote:

On Tue, Sep 1, 2015 at 3:58 PM, Wanpeng Li  wrote:

On 9/2/15 6:34 AM, David Matlack wrote:

On Tue, Sep 1, 2015 at 3:30 PM, Wanpeng Li  wrote:

On 9/2/15 5:45 AM, David Matlack wrote:

On Thu, Aug 27, 2015 at 2:47 AM, Wanpeng Li 
wrote:

v3 -> v4:
* bring back grow vcpu->halt_poll_ns when interrupt arrives and
shrinks
  when idle VCPU is detected

v2 -> v3:
* grow/shrink vcpu->halt_poll_ns by *halt_poll_ns_grow or
/halt_poll_ns_shrink
* drop the macros and hard coding the numbers in the param
definitions
* update the comments "5-7 us"
* remove halt_poll_ns_max and use halt_poll_ns as the max
halt_poll_ns
time,
  vcpu->halt_poll_ns start at zero
* drop the wrappers
* move the grow/shrink logic before "out:" w/ "if (waited)"

I posted a patchset which adds dynamic poll toggling (on/off switch). I
think
this gives you a good place to build your dynamic growth patch on top.
The
toggling patch has close to zero overhead for idle VMs and equivalent
performance VMs doing message passing as always-poll. It's a patch
that's
been
in my queue for a few weeks but just haven't had the time to send out.
We
can
win even more with your patchset by only polling as much as we need (via
dynamic growth/shrink). It also gives us a better place to stand for
choosing
a default for halt_poll_ns. (We can run experiments and see how high
vcpu->halt_poll_ns tends to grow.)

The reason I posted a separate patch for toggling is because it adds
timers
to kvm_vcpu_block and deals with a weird edge case (kvm_vcpu_block can
get
called multiple times for one halt). To do dynamic poll adjustment


Why this can happen?

Ah, probably because I'm missing 9c8fd1ba220 (KVM: x86: optimize delivery
of TSC deadline timer interrupt). I don't think the edge case exists in
the latest kernel.


Yeah, I hope we both (including Peter Kieser) can test against the latest kvm
tree to avoid confusion. The reason to introduce the adaptive halt-polling
toggle is to handle the "edge case" you mentioned above. So I think we can
put more effort into improving v4 instead. I will improve v4 to handle short
halts today. ;-)







correctly,
we have to time the length of each halt. Otherwise we hit some bad edge
cases:

 v3: v3 had lots of idle overhead. It's because vcpu->halt_poll_ns
grew
every
 time we had a long halt. So idle VMs looked like: 0 us -> 500 us ->
1
ms ->
 2 ms -> 4 ms -> 0 us. Ideally vcpu->halt_poll_ns should just stay at
0
when
 the halts are long.

 v4: v4 fixed the idle overhead problem but broke dynamic growth for
message
 passing VMs. Every time a VM did a short halt, vcpu->halt_poll_ns
would
grow.
 That means vcpu->halt_poll_ns will always be maxed out, even when
the
halt
 time is much less than the max.

I think we can fix both edge cases if we make grow/shrink decisions
based
on
the length of kvm_vcpu_block rather than the arrival of a guest
interrupt
during polling.

Some thoughts for dynamic growth:
 * Given Windows 10 timer tick (1 ms), let's set the maximum poll
time
to
   less than 1ms. 200 us has been a good value for always-poll. We
can
   probably go a bit higher once we have your patch. Maybe 500 us?


Did you test your patch against a windows guest?

I have not. I tested against a 250HZ linux guest to check how it performs
against a ticking guest. Presumably, windows should be the same, but at a
higher tick rate. Do you have a test for Windows?


I just test the idle vCPUs usage.


V4 for windows 10:

+------------------+------------------+------------------------+
|                  |                  |                        |
|  w/o halt-poll   |   w/ halt-poll   |  dynamic(v4) halt-poll |
+------------------+------------------+------------------------+
|                  |                  |                        |
|      ~2.1%       |      ~3.0%       |         ~2.4%          |
+------------------+------------------+------------------------+

V4  for linux guest:

+-++---+
| ||   |
|  w/o halt-poll  |  w/ halt-poll  | dynamic halt-poll |
+-++---+
| ||   |
|~0.9%|~1.8%   | ~1.2% |
+-++---+


Regards,
Wanpeng Li




 * The base case of dynamic growth (the first grow() after being at
0)
should
   be small. 500 us is too big. When I run TCP_RR in my guest I see
poll
times
   of < 10 us. TCP_RR is on the lower-end of message passing workload
latency,
   so 10 us would be a good base case.


How to get your TCP_RR benchmark?

Regards,
Wanpeng Li

Install the netperf package, or build from here:
http://www.netperf.org/netperf/DownloadNetperf.html

In the vm:

#

Re: [PATCH v4 0/3] KVM: Dynamic Halt-Polling

2015-09-01 Thread David Matlack
On Tue, Sep 1, 2015 at 3:58 PM, Wanpeng Li  wrote:
> On 9/2/15 6:34 AM, David Matlack wrote:
>>
>> On Tue, Sep 1, 2015 at 3:30 PM, Wanpeng Li  wrote:
>>>
>>> On 9/2/15 5:45 AM, David Matlack wrote:

 On Thu, Aug 27, 2015 at 2:47 AM, Wanpeng Li 
 wrote:
>
> v3 -> v4:
>* bring back grow vcpu->halt_poll_ns when interrupt arrives and
> shrinks
>  when idle VCPU is detected
>
> v2 -> v3:
>* grow/shrink vcpu->halt_poll_ns by *halt_poll_ns_grow or
> /halt_poll_ns_shrink
>* drop the macros and hard coding the numbers in the param
> definitions
>* update the comments "5-7 us"
>* remove halt_poll_ns_max and use halt_poll_ns as the max
> halt_poll_ns
> time,
>  vcpu->halt_poll_ns start at zero
>* drop the wrappers
>* move the grow/shrink logic before "out:" w/ "if (waited)"

 I posted a patchset which adds dynamic poll toggling (on/off switch). I
 think
 this gives you a good place to build your dynamic growth patch on top.
 The
 toggling patch has close to zero overhead for idle VMs and equivalent
 performance VMs doing message passing as always-poll. It's a patch
 that's
 been
 in my queue for a few weeks but just haven't had the time to send out.
 We
 can
 win even more with your patchset by only polling as much as we need (via
 dynamic growth/shrink). It also gives us a better place to stand for
 choosing
 a default for halt_poll_ns. (We can run experiments and see how high
 vcpu->halt_poll_ns tends to grow.)

 The reason I posted a separate patch for toggling is because it adds
 timers
 to kvm_vcpu_block and deals with a weird edge case (kvm_vcpu_block can
 get
 called multiple times for one halt). To do dynamic poll adjustment
>
>
> Why this can happen?

Ah, probably because I'm missing 9c8fd1ba220 (KVM: x86: optimize delivery
of TSC deadline timer interrupt). I don't think the edge case exists in
the latest kernel.

>
>
 correctly,
 we have to time the length of each halt. Otherwise we hit some bad edge
 cases:

 v3: v3 had lots of idle overhead. It's because vcpu->halt_poll_ns
 grew
 every
 time we had a long halt. So idle VMs looked like: 0 us -> 500 us ->
 1
 ms ->
 2 ms -> 4 ms -> 0 us. Ideally vcpu->halt_poll_ns should just stay at
 0
 when
 the halts are long.

 v4: v4 fixed the idle overhead problem but broke dynamic growth for
 message
 passing VMs. Every time a VM did a short halt, vcpu->halt_poll_ns
 would
 grow.
 That means vcpu->halt_poll_ns will always be maxed out, even when
 the
 halt
 time is much less than the max.

 I think we can fix both edge cases if we make grow/shrink decisions
 based
 on
 the length of kvm_vcpu_block rather than the arrival of a guest
 interrupt
 during polling.

 Some thoughts for dynamic growth:
 * Given Windows 10 timer tick (1 ms), let's set the maximum poll
 time
 to
   less than 1ms. 200 us has been a good value for always-poll. We
 can
   probably go a bit higher once we have your patch. Maybe 500 us?
>
>
> Did you test your patch against a windows guest?

I have not. I tested against a 250HZ linux guest to check how it performs
against a ticking guest. Presumably, windows should be the same, but at a
higher tick rate. Do you have a test for Windows?

>

 * The base case of dynamic growth (the first grow() after being at
 0)
 should
   be small. 500 us is too big. When I run TCP_RR in my guest I see
 poll
 times
   of < 10 us. TCP_RR is on the lower-end of message passing workload
 latency,
   so 10 us would be a good base case.
>>>
>>>
>>> How to get your TCP_RR benchmark?
>>>
>>> Regards,
>>> Wanpeng Li
>>
>> Install the netperf package, or build from here:
>> http://www.netperf.org/netperf/DownloadNetperf.html
>>
>> In the vm:
>>
>> # ./netserver
>> # ./netperf -t TCP_RR
>>
>> Be sure to use an SMP guest (we want TCP_RR to be a cross-core message
>> passing workload in order to test halt-polling).
>
>
> Ah, ok, I use the same benchmark as yours.
>
> Regards,
> Wanpeng Li
>
>
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 0/3] KVM: Dynamic Halt-Polling

2015-09-01 Thread Wanpeng Li

On 9/2/15 6:34 AM, David Matlack wrote:

On Tue, Sep 1, 2015 at 3:30 PM, Wanpeng Li  wrote:

On 9/2/15 5:45 AM, David Matlack wrote:

On Thu, Aug 27, 2015 at 2:47 AM, Wanpeng Li 
wrote:

v3 -> v4:
   * bring back grow vcpu->halt_poll_ns when interrupt arrives and shrinks
 when idle VCPU is detected

v2 -> v3:
   * grow/shrink vcpu->halt_poll_ns by *halt_poll_ns_grow or
/halt_poll_ns_shrink
   * drop the macros and hard coding the numbers in the param definitions
   * update the comments "5-7 us"
   * remove halt_poll_ns_max and use halt_poll_ns as the max halt_poll_ns
time,
 vcpu->halt_poll_ns start at zero
   * drop the wrappers
   * move the grow/shrink logic before "out:" w/ "if (waited)"

I posted a patchset which adds dynamic poll toggling (on/off switch). I
think
this gives you a good place to build your dynamic growth patch on top. The
toggling patch has close to zero overhead for idle VMs and equivalent
performance VMs doing message passing as always-poll. It's a patch that's
been
in my queue for a few weeks but just haven't had the time to send out. We
can
win even more with your patchset by only polling as much as we need (via
dynamic growth/shrink). It also gives us a better place to stand for
choosing
a default for halt_poll_ns. (We can run experiments and see how high
vcpu->halt_poll_ns tends to grow.)

The reason I posted a separate patch for toggling is because it adds
timers
to kvm_vcpu_block and deals with a weird edge case (kvm_vcpu_block can get
called multiple times for one halt). To do dynamic poll adjustment


Why this can happen?


correctly,
we have to time the length of each halt. Otherwise we hit some bad edge
cases:

v3: v3 had lots of idle overhead. It's because vcpu->halt_poll_ns grew
every
time we had a long halt. So idle VMs looked like: 0 us -> 500 us -> 1
ms ->
2 ms -> 4 ms -> 0 us. Ideally vcpu->halt_poll_ns should just stay at 0
when
the halts are long.

v4: v4 fixed the idle overhead problem but broke dynamic growth for
message
passing VMs. Every time a VM did a short halt, vcpu->halt_poll_ns would
grow.
That means vcpu->halt_poll_ns will always be maxed out, even when the
halt
time is much less than the max.

I think we can fix both edge cases if we make grow/shrink decisions based
on
the length of kvm_vcpu_block rather than the arrival of a guest interrupt
during polling.

Some thoughts for dynamic growth:
* Given Windows 10 timer tick (1 ms), let's set the maximum poll time
to
  less than 1ms. 200 us has been a good value for always-poll. We can
  probably go a bit higher once we have your patch. Maybe 500 us?


Did you test your patch against a windows guest?



* The base case of dynamic growth (the first grow() after being at 0)
should
  be small. 500 us is too big. When I run TCP_RR in my guest I see poll
times
  of < 10 us. TCP_RR is on the lower-end of message passing workload
latency,
  so 10 us would be a good base case.


How to get your TCP_RR benchmark?

Regards,
Wanpeng Li

Install the netperf package, or build from here:
http://www.netperf.org/netperf/DownloadNetperf.html

In the vm:

# ./netserver
# ./netperf -t TCP_RR

Be sure to use an SMP guest (we want TCP_RR to be a cross-core message
passing workload in order to test halt-polling).


Ah, ok, I use the same benchmark as yours.

Regards,
Wanpeng Li


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: ppc: Fix size of the PSPB register

2015-09-01 Thread Benjamin Herrenschmidt
On Wed, 2015-09-02 at 08:45 +1000, Paul Mackerras wrote:
> On Wed, Sep 02, 2015 at 08:25:05AM +1000, Benjamin Herrenschmidt
> wrote:
> > On Tue, 2015-09-01 at 23:41 +0200, Thomas Huth wrote:
> > > The size of the Problem State Priority Boost Register is only
> > > 32 bits, so let's change the type of the corresponding variable
> > > accordingly to avoid future trouble.
> > 
> > It's not future trouble, it's broken today for LE and this should
> > fix
> > it BUT 
> 
> No, it's broken today for BE hosts, which will always see 0 for the
> PSPB register value.  LE hosts are fine.

0 or PSPB << 32 ?

> > The asm accesses it using lwz/stw and C accesses it as a ulong. On
> > LE
> > that will mean that userspace will see the value << 32
> 
> No, that will happen on BE, and since KVM_REG_PPC_PSPB says it's a
> 32-bit register, we'll just pass 0 back to userspace when it reads
> it.

Ah ok, I missed that bit about KVM_REG_PPC_PSPB

> > Now "fixing" it might break migration if that field is already
> > stored/loaded in its "broken" form. We may have to keep the
> > "broken"
> > behaviour and document that qemu sees a value shifted by 32.
> 
> It will be being set to 0 on BE hosts across migration today
> (fortunately 0 is a benign value for PSPB).  If we fix this on both
> the source and destination host, then the value will get migrated
> across correctly.

Ok, I missed the part where KVM_REG_PPC_PSPB passed it down as a 32
-bit. That means Thomas patch should work indeed.

> I think Thomas's patch is fine, it just needs a stronger patch
> description saying that it fixes an actual bug.

Right.

Cheers,
Ben.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: ppc: Fix size of the PSPB register

2015-09-01 Thread Paul Mackerras
On Wed, Sep 02, 2015 at 08:25:05AM +1000, Benjamin Herrenschmidt wrote:
> On Tue, 2015-09-01 at 23:41 +0200, Thomas Huth wrote:
> > The size of the Problem State Priority Boost Register is only
> > 32 bits, so let's change the type of the corresponding variable
> > accordingly to avoid future trouble.
> 
> It's not future trouble, it's broken today for LE and this should fix
> it BUT 

No, it's broken today for BE hosts, which will always see 0 for the
PSPB register value.  LE hosts are fine.

> The asm accesses it using lwz/stw and C accesses it as a ulong. On LE
> that will mean that userspace will see the value << 32

No, that will happen on BE, and since KVM_REG_PPC_PSPB says it's a
32-bit register, we'll just pass 0 back to userspace when it reads it.
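
(A toy userspace illustration of that layout, just a sketch and not the KVM
code: store 32 bits into the first word of a 64-bit slot, the way the stw
into the ulong field does, and look at where the value ends up.)

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		unsigned long pspb = 0;			/* the 8-byte C field */
		unsigned int hw_value = 0x1234;		/* what the 32-bit stw writes */

		memcpy(&pspb, &hw_value, sizeof(hw_value));
		/* LE prints 0x1234 / 0x1234; BE prints 0x123400000000 / 0x0 */
		printf("ulong view 0x%lx, low 32 bits 0x%x\n",
		       pspb, (unsigned int)pspb);
		return 0;
	}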

> Now "fixing" it might break migration if that field is already
> stored/loaded in its "broken" form. We may have to keep the "broken"
> behaviour and document that qemu sees a value shifted by 32.

It will be being set to 0 on BE hosts across migration today
(fortunately 0 is a benign value for PSPB).  If we fix this on both
the source and destination host, then the value will get migrated
across correctly.

I think Thomas's patch is fine, it just needs a stronger patch
description saying that it fixes an actual bug.

Paul.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [GIT PULL] Early batch of KVM changes for 4.3 merge window

2015-09-01 Thread Xiao Guangrong



On 09/02/2015 01:03 AM, Paolo Bonzini wrote:



diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index fb16a8ea3dee..3c745f3abde8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3309,13 +3309,13 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)

walk_shadow_page_lockless_begin(vcpu);

-   for (shadow_walk_init(&iterator, vcpu, addr), root = iterator.level;
+   for (shadow_walk_init(&iterator, vcpu, addr),
+leaf = root = iterator.level;
 shadow_walk_okay(&iterator);
 __shadow_walk_next(&iterator, spte)) {
-   leaf = iterator.level;
spte = mmu_spte_get_lockless(iterator.sptep);

-   sptes[leaf - 1] = spte;
+   sptes[--leaf] = spte;

if (!is_shadow_present_pte(spte))
break;
@@ -3329,7 +3329,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
if (reserved) {
pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump 
hierarchy:\n",
   __func__, addr);
-   while (root >= leaf) {
+   while (root > leaf) {
pr_err("-- spte 0x%llx level %d.\n",
   sptes[root - 1], root);
root--;


But honestly I haven't even compiled it yet.  Xiao, what do you think?



It looks good to me!

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: commit 3c2e7f7de3 (KVM use NPT page attributes) causes boot failures

2015-09-01 Thread Xiao Guangrong



On 09/01/2015 09:56 PM, Markus Trippelsdorf wrote:

On 2015.09.01 at 21:00 +0800, Xiao Guangrong wrote:


Did it trigger the BUG()/BUG_ON() in mtrr2protval()/fallback_mtrr_type()?
If yes, could you please print the actual value out?


It is the BUG() in fallback_mtrr_type(). I changed it to a printk and
it prints 1 for the value of mtrr.

  MTRR_TYPE_WRCOMB 1



Then I suspect pat is not enabled in your box, could you please check
CONFIG_X86_PAT is selected in your .config file, pat is shown in
/proc/cpuinfo, "nopat" kernel parameter is used, and dmesg | grep PAT.

I will post a fix if the suspect is right.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 0/3] KVM: Dynamic Halt-Polling

2015-09-01 Thread David Matlack
On Tue, Sep 1, 2015 at 3:30 PM, Wanpeng Li  wrote:
> On 9/2/15 5:45 AM, David Matlack wrote:
>>
>> On Thu, Aug 27, 2015 at 2:47 AM, Wanpeng Li 
>> wrote:
>>>
>>> v3 -> v4:
>>>   * bring back grow vcpu->halt_poll_ns when interrupt arrives and shrinks
>>> when idle VCPU is detected
>>>
>>> v2 -> v3:
>>>   * grow/shrink vcpu->halt_poll_ns by *halt_poll_ns_grow or
>>> /halt_poll_ns_shrink
>>>   * drop the macros and hard coding the numbers in the param definitions
>>>   * update the comments "5-7 us"
>>>   * remove halt_poll_ns_max and use halt_poll_ns as the max halt_poll_ns
>>> time,
>>> vcpu->halt_poll_ns start at zero
>>>   * drop the wrappers
>>>   * move the grow/shrink logic before "out:" w/ "if (waited)"
>>
>> I posted a patchset which adds dynamic poll toggling (on/off switch). I
>> think
>> this gives you a good place to build your dynamic growth patch on top. The
>> toggling patch has close to zero overhead for idle VMs and equivalent
>> performance VMs doing message passing as always-poll. It's a patch that's
>> been
>> in my queue for a few weeks but just haven't had the time to send out. We
>> can
>> win even more with your patchset by only polling as much as we need (via
>> dynamic growth/shrink). It also gives us a better place to stand for
>> choosing
>> a default for halt_poll_ns. (We can run experiments and see how high
>> vcpu->halt_poll_ns tends to grow.)
>>
>> The reason I posted a separate patch for toggling is because it adds
>> timers
>> to kvm_vcpu_block and deals with a weird edge case (kvm_vcpu_block can get
>> called multiple times for one halt). To do dynamic poll adjustment
>> correctly,
>> we have to time the length of each halt. Otherwise we hit some bad edge
>> cases:
>>
>>v3: v3 had lots of idle overhead. It's because vcpu->halt_poll_ns grew
>> every
>>time we had a long halt. So idle VMs looked like: 0 us -> 500 us -> 1
>> ms ->
>>2 ms -> 4 ms -> 0 us. Ideally vcpu->halt_poll_ns should just stay at 0
>> when
>>the halts are long.
>>
>>v4: v4 fixed the idle overhead problem but broke dynamic growth for
>> message
>>passing VMs. Every time a VM did a short halt, vcpu->halt_poll_ns would
>> grow.
>>That means vcpu->halt_poll_ns will always be maxed out, even when the
>> halt
>>time is much less than the max.
>>
>> I think we can fix both edge cases if we make grow/shrink decisions based
>> on
>> the length of kvm_vcpu_block rather than the arrival of a guest interrupt
>> during polling.
>>
>> Some thoughts for dynamic growth:
>>* Given Windows 10 timer tick (1 ms), let's set the maximum poll time
>> to
>>  less than 1ms. 200 us has been a good value for always-poll. We can
>>  probably go a bit higher once we have your patch. Maybe 500 us?
>>
>>* The base case of dynamic growth (the first grow() after being at 0)
>> should
>>  be small. 500 us is too big. When I run TCP_RR in my guest I see poll
>> times
>>  of < 10 us. TCP_RR is on the lower-end of message passing workload
>> latency,
>>  so 10 us would be a good base case.
>
>
> How to get your TCP_RR benchmark?
>
> Regards,
> Wanpeng Li

Install the netperf package, or build from here:
http://www.netperf.org/netperf/DownloadNetperf.html

In the vm:

# ./netserver
# ./netperf -t TCP_RR

Be sure to use an SMP guest (we want TCP_RR to be a cross-core message
passing workload in order to test halt-polling).

>
>
>>> v1 -> v2:
>>>   * change kvm_vcpu_block to read halt_poll_ns from the vcpu instead of
>>> the module parameter
>>>   * use the shrink/grow matrix which is suggested by David
>>>   * set halt_poll_ns_max to 2ms
>>>
>>> There is a downside of halt_poll_ns since poll is still happen for idle
>>> VCPU which can waste cpu usage. This patchset add the ability to adjust
>>> halt_poll_ns dynamically, grows halt_poll_ns if an interrupt arrives and
>>> shrinks halt_poll_ns when idle VCPU is detected.
>>>
>>> There are two new kernel parameters for changing the halt_poll_ns:
>>> halt_poll_ns_grow and halt_poll_ns_shrink.
>>>
>>>
>>> Test w/ high cpu overcommit ratio, pin vCPUs, and the halt_poll_ns of
>>> halt-poll is the default 500000ns, the max halt_poll_ns of dynamic
>>> halt-poll is 2ms. Then watch the %C0 in the dump of Powertop tool.
>>> The test method is almost from David.
>>>
>>> +-++---+
>>> | ||   |
>>> |  w/o halt-poll  |  w/ halt-poll  | dynamic halt-poll |
>>> +-++---+
>>> | ||   |
>>> |~0.9%|~1.8%   | ~1.2% |
>>> +-++---+
>>>
>>> The always halt-poll will increase ~0.9% cpu usage for idle vCPUs and the
>>> dynamic halt-poll drop it to ~0.3% which means that reduce the 67%
>>> overhead
>>> introduced by always halt-poll.
>>>
>>> Wanpeng Li (3):
>>>KVM: make halt_poll_ns p

Re: [PATCH] KVM: ppc: Fix size of the PSPB register

2015-09-01 Thread Benjamin Herrenschmidt
On Wed, 2015-09-02 at 08:24 +1000, Paul Mackerras wrote:
> On Tue, Sep 01, 2015 at 11:41:18PM +0200, Thomas Huth wrote:
> > The size of the Problem State Priority Boost Register is only
> > 32 bits, so let's change the type of the corresponding variable
> > accordingly to avoid future trouble.
> 
> Since we're already using lwz/stw in the assembly code in
> book3s_hv_rmhandlers.S, this is actually a bug fix, isn't it?
> How did you find it?  Did you observe a failure of some kind, or did
> you just find it by code inspection?

Won't the fix break migration? Unless qemu doesn't migrate it ...

> Paul.
> ___
> Linuxppc-dev mailing list
> linuxppc-...@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 0/3] KVM: Dynamic Halt-Polling

2015-09-01 Thread Wanpeng Li

On 9/2/15 5:45 AM, David Matlack wrote:

On Thu, Aug 27, 2015 at 2:47 AM, Wanpeng Li  wrote:

v3 -> v4:
  * bring back grow vcpu->halt_poll_ns when interrupt arrives and shrinks
when idle VCPU is detected

v2 -> v3:
  * grow/shrink vcpu->halt_poll_ns by *halt_poll_ns_grow or /halt_poll_ns_shrink
  * drop the macros and hard coding the numbers in the param definitions
  * update the comments "5-7 us"
  * remove halt_poll_ns_max and use halt_poll_ns as the max halt_poll_ns time,
vcpu->halt_poll_ns start at zero
  * drop the wrappers
  * move the grow/shrink logic before "out:" w/ "if (waited)"

I posted a patchset which adds dynamic poll toggling (on/off switch). I think
this gives you a good place to build your dynamic growth patch on top. The
toggling patch has close to zero overhead for idle VMs and equivalent
performance VMs doing message passing as always-poll. It's a patch that's been
in my queue for a few weeks but just haven't had the time to send out. We can
win even more with your patchset by only polling as much as we need (via
dynamic growth/shrink). It also gives us a better place to stand for choosing
a default for halt_poll_ns. (We can run experiments and see how high
vcpu->halt_poll_ns tends to grow.)

The reason I posted a separate patch for toggling is because it adds timers
to kvm_vcpu_block and deals with a weird edge case (kvm_vcpu_block can get
called multiple times for one halt). To do dynamic poll adjustment correctly,
we have to time the length of each halt. Otherwise we hit some bad edge cases:

   v3: v3 had lots of idle overhead. It's because vcpu->halt_poll_ns grew every
   time we had a long halt. So idle VMs looked like: 0 us -> 500 us -> 1 ms ->
   2 ms -> 4 ms -> 0 us. Ideally vcpu->halt_poll_ns should just stay at 0 when
   the halts are long.

   v4: v4 fixed the idle overhead problem but broke dynamic growth for message
   passing VMs. Every time a VM did a short halt, vcpu->halt_poll_ns would grow.
   That means vcpu->halt_poll_ns will always be maxed out, even when the halt
   time is much less than the max.

I think we can fix both edge cases if we make grow/shrink decisions based on
the length of kvm_vcpu_block rather than the arrival of a guest interrupt
during polling.

Some thoughts for dynamic growth:
   * Given Windows 10 timer tick (1 ms), let's set the maximum poll time to
 less than 1ms. 200 us has been a good value for always-poll. We can
 probably go a bit higher once we have your patch. Maybe 500 us?

   * The base case of dynamic growth (the first grow() after being at 0) should
 be small. 500 us is too big. When I run TCP_RR in my guest I see poll times
 of < 10 us. TCP_RR is on the lower-end of message passing workload latency,
 so 10 us would be a good base case.


How to get your TCP_RR benchmark?

Regards,
Wanpeng Li


v1 -> v2:
  * change kvm_vcpu_block to read halt_poll_ns from the vcpu instead of
the module parameter
  * use the shrink/grow matrix which is suggested by David
  * set halt_poll_ns_max to 2ms

There is a downside of halt_poll_ns since poll is still happen for idle
VCPU which can waste cpu usage. This patchset add the ability to adjust
halt_poll_ns dynamically, grows halt_poll_ns if an interrupt arrives and
shrinks halt_poll_ns when idle VCPU is detected.

There are two new kernel parameters for changing the halt_poll_ns:
halt_poll_ns_grow and halt_poll_ns_shrink.


Test w/ high cpu overcommit ratio, pin vCPUs, and the halt_poll_ns of
halt-poll is the default 500000ns, the max halt_poll_ns of dynamic
halt-poll is 2ms. Then watch the %C0 in the dump of Powertop tool.
The test method is almost from David.

+-++---+
| ||   |
|  w/o halt-poll  |  w/ halt-poll  | dynamic halt-poll |
+-++---+
| ||   |
|~0.9%|~1.8%   | ~1.2% |
+-++---+

The always halt-poll will increase ~0.9% cpu usage for idle vCPUs and the
dynamic halt-poll drop it to ~0.3% which means that reduce the 67% overhead
introduced by always halt-poll.

Wanpeng Li (3):
   KVM: make halt_poll_ns per-VCPU
   KVM: dynamic halt_poll_ns adjustment
   KVM: trace kvm_halt_poll_ns grow/shrink

  include/linux/kvm_host.h   |  1 +
  include/trace/events/kvm.h | 30 
  virt/kvm/kvm_main.c| 50 +++---
  3 files changed, 78 insertions(+), 3 deletions(-)
--
1.9.1



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: ppc: Fix size of the PSPB register

2015-09-01 Thread Benjamin Herrenschmidt
On Tue, 2015-09-01 at 23:41 +0200, Thomas Huth wrote:
> The size of the Problem State Priority Boost Register is only
> 32 bits, so let's change the type of the corresponding variable
> accordingly to avoid future trouble.

It's not future trouble, it's broken today for LE and this should fix
it BUT 

The asm accesses it using lwz/stw and C accesses it as a ulong. On LE
that will mean that userspace will see the value << 32

Now "fixing" it might break migration if that field is already
stored/loaded in its "broken" form. We may have to keep the "broken"
behaviour and document that qemu sees a value shifted by 32.

Cheers,
Ben.

> Signed-off-by: Thomas Huth 
> ---
>  arch/powerpc/include/asm/kvm_host.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index d91f65b..c825f3a 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -473,7 +473,7 @@ struct kvm_vcpu_arch {
>   ulong ciabr;
>   ulong cfar;
>   ulong ppr;
> - ulong pspb;
> + u32 pspb;
>   ulong fscr;
>   ulong shadow_fscr;
>   ulong ebbhr;

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: ppc: Fix size of the PSPB register

2015-09-01 Thread Paul Mackerras
On Tue, Sep 01, 2015 at 11:41:18PM +0200, Thomas Huth wrote:
> The size of the Problem State Priority Boost Register is only
> 32 bits, so let's change the type of the corresponding variable
> accordingly to avoid future trouble.

Since we're already using lwz/stw in the assembly code in
book3s_hv_rmhandlers.S, this is actually a bug fix, isn't it?
How did you find it?  Did you observe a failure of some kind, or did
you just find it by code inspection?

Paul.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 0/3] KVM: Dynamic Halt-Polling

2015-09-01 Thread David Matlack
On Thu, Aug 27, 2015 at 2:47 AM, Wanpeng Li  wrote:
> v3 -> v4:
>  * bring back grow vcpu->halt_poll_ns when interrupt arrives and shrinks
>when idle VCPU is detected
>
> v2 -> v3:
>  * grow/shrink vcpu->halt_poll_ns by *halt_poll_ns_grow or 
> /halt_poll_ns_shrink
>  * drop the macros and hard coding the numbers in the param definitions
>  * update the comments "5-7 us"
>  * remove halt_poll_ns_max and use halt_poll_ns as the max halt_poll_ns time,
>vcpu->halt_poll_ns start at zero
>  * drop the wrappers
>  * move the grow/shrink logic before "out:" w/ "if (waited)"

I posted a patchset which adds dynamic poll toggling (on/off switch). I think
this gives you a good place to build your dynamic growth patch on top. The
toggling patch has close to zero overhead for idle VMs and equivalent
performance VMs doing message passing as always-poll. It's a patch that's been
in my queue for a few weeks but just haven't had the time to send out. We can
win even more with your patchset by only polling as much as we need (via
dynamic growth/shrink). It also gives us a better place to stand for choosing
a default for halt_poll_ns. (We can run experiments and see how high
vcpu->halt_poll_ns tends to grow.)

The reason I posted a separate patch for toggling is because it adds timers
to kvm_vcpu_block and deals with a weird edge case (kvm_vcpu_block can get
called multiple times for one halt). To do dynamic poll adjustment correctly,
we have to time the length of each halt. Otherwise we hit some bad edge cases:

  v3: v3 had lots of idle overhead. It's because vcpu->halt_poll_ns grew every
  time we had a long halt. So idle VMs looked like: 0 us -> 500 us -> 1 ms ->
  2 ms -> 4 ms -> 0 us. Ideally vcpu->halt_poll_ns should just stay at 0 when
  the halts are long.

  v4: v4 fixed the idle overhead problem but broke dynamic growth for message
  passing VMs. Every time a VM did a short halt, vcpu->halt_poll_ns would grow.
  That means vcpu->halt_poll_ns will always be maxed out, even when the halt
  time is much less than the max.

I think we can fix both edge cases if we make grow/shrink decisions based on
the length of kvm_vcpu_block rather than the arrival of a guest interrupt
during polling.

Some thoughts for dynamic growth:
  * Given Windows 10 timer tick (1 ms), let's set the maximum poll time to
less than 1ms. 200 us has been a good value for always-poll. We can
probably go a bit higher once we have your patch. Maybe 500 us?

  * The base case of dynamic growth (the first grow() after being at 0) should
be small. 500 us is too big. When I run TCP_RR in my guest I see poll times
of < 10 us. TCP_RR is on the lower-end of message passing workload latency,
so 10 us would be a good base case.

>
> v1 -> v2:
>  * change kvm_vcpu_block to read halt_poll_ns from the vcpu instead of
>the module parameter
>  * use the shrink/grow matrix which is suggested by David
>  * set halt_poll_ns_max to 2ms
>
> There is a downside of halt_poll_ns since poll is still happen for idle
> VCPU which can waste cpu usage. This patchset add the ability to adjust
> halt_poll_ns dynamically, grows halt_poll_ns if an interrupt arrives and
> shrinks halt_poll_ns when idle VCPU is detected.
>
> There are two new kernel parameters for changing the halt_poll_ns:
> halt_poll_ns_grow and halt_poll_ns_shrink.
>
>
> Test w/ high cpu overcommit ratio, pin vCPUs, and the halt_poll_ns of
> halt-poll is the default 500000ns, the max halt_poll_ns of dynamic
> halt-poll is 2ms. Then watch the %C0 in the dump of Powertop tool.
> The test method is almost from David.
>
> +-++---+
> | ||   |
> |  w/o halt-poll  |  w/ halt-poll  | dynamic halt-poll |
> +-++---+
> | ||   |
> |~0.9%|~1.8%   | ~1.2% |
> +-++---+
>
> The always halt-poll will increase ~0.9% cpu usage for idle vCPUs and the
> dynamic halt-poll drop it to ~0.3% which means that reduce the 67% overhead
> introduced by always halt-poll.
>
> Wanpeng Li (3):
>   KVM: make halt_poll_ns per-VCPU
>   KVM: dynamic halt_poll_ns adjustment
>   KVM: trace kvm_halt_poll_ns grow/shrink
>
>  include/linux/kvm_host.h   |  1 +
>  include/trace/events/kvm.h | 30 
>  virt/kvm/kvm_main.c| 50 +++---
>  3 files changed, 78 insertions(+), 3 deletions(-)
> --
> 1.9.1
>
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] kvm: adaptive halt-polling toggle

2015-09-01 Thread David Matlack
This patch removes almost all of the overhead of polling for idle VCPUs
by disabling polling for long halts. The length of the previous halt
is used as a predictor for the current halt:

  if (length of previous halt < halt_poll_ns): poll for halt_poll_ns
  else: don't poll

This tends to work well in practice. For VMs running Message Passing
workloads, all halts are short and so the VCPU should always poll. When
a VCPU is idle, all halts are long and so the VCPU should never poll.
Experimental results on an IvyBridge host show adaptive toggling gets
close to the best of both worlds.

                        no-poll      always-poll    adaptive-toggle
--------------------------------------------------------------------
Idle (nohz) VCPU %c0    0.12         0.32           0.15
Idle (250HZ) VCPU %c0   1.22         6.35           1.27
TCP_RR latency          39 us        25 us          25 us

(3.16 Linux guest, halt_poll_ns=200000)

The big win is with ticking operating systems. Running the linux guest
with nohz=off (and HZ=250), we save 5% CPU/second and get close to
no-polling overhead levels by using the adaptive toggle. The savings
should be even higher for higher frequency ticks.

Signed-off-by: David Matlack 
---
 include/trace/events/kvm.h |  23 ++
 virt/kvm/kvm_main.c| 110 ++---
 2 files changed, 97 insertions(+), 36 deletions(-)

diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index a44062d..34e0b11 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -38,22 +38,27 @@ TRACE_EVENT(kvm_userspace_exit,
 );
 
 TRACE_EVENT(kvm_vcpu_wakeup,
-   TP_PROTO(__u64 ns, bool waited),
-   TP_ARGS(ns, waited),
+   TP_PROTO(bool poll, bool success, __u64 poll_ns, __u64 wait_ns),
+   TP_ARGS(poll, success, poll_ns, wait_ns),
 
TP_STRUCT__entry(
-   __field(__u64,  ns  )
-   __field(bool,   waited  )
+   __field( bool,  poll)
+   __field( bool,  success )
+   __field(__u64,  poll_ns )
+   __field(__u64,  wait_ns )
),
 
TP_fast_assign(
-   __entry->ns = ns;
-   __entry->waited = waited;
+   __entry->poll   = poll;
+   __entry->success= success;
+   __entry->poll_ns= poll_ns;
+   __entry->wait_ns= wait_ns;
),
 
-   TP_printk("%s time %lld ns",
- __entry->waited ? "wait" : "poll",
- __entry->ns)
+   TP_printk("%s %s, poll ns %lld, wait ns %lld",
+ __entry->poll ? "poll" : "wait",
+ __entry->success ? "success" : "fail",
+ __entry->poll_ns, __entry->wait_ns)
 );
 
 #if defined(CONFIG_HAVE_KVM_IRQFD)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 977ffb1..3a66694 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -66,7 +66,8 @@
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
-static unsigned int halt_poll_ns;
+/* The maximum amount of time a vcpu will poll for interrupts while halted. */
+static unsigned int halt_poll_ns = 200000;
 module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR);
 
 /*
@@ -1907,6 +1908,7 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, 
gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
 
+/* This sets KVM_REQ_UNHALT if an interrupt arrives. */
 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
 {
if (kvm_arch_vcpu_runnable(vcpu)) {
@@ -1921,47 +1923,101 @@ static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
return 0;
 }
 
-/*
- * The vCPU has executed a HLT instruction with in-kernel mode enabled.
- */
-void kvm_vcpu_block(struct kvm_vcpu *vcpu)
+static void
+update_vcpu_block_predictor(struct kvm_vcpu *vcpu, u64 poll_ns, u64 wait_ns)
 {
-   ktime_t start, cur;
-   DEFINE_WAIT(wait);
-   bool waited = false;
-
-   start = cur = ktime_get();
-   if (vcpu->halt_poll_ns) {
-   ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
-
-   do {
-   /*
-* This sets KVM_REQ_UNHALT if an interrupt
-* arrives.
-*/
-   if (kvm_vcpu_check_block(vcpu) < 0) {
-   ++vcpu->stat.halt_successful_poll;
-   goto out;
-   }
-   cur = ktime_get();
-   } while (single_task_running() && ktime_before(cur, stop));
+   u64 block_ns = poll_ns + wait_ns;
+
+   if (block_ns <= vcpu->halt_poll_ns)
+   return;
+
+   if (block_ns < halt_poll_ns)
+   /* we had a short block a
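
As a standalone illustration of the predictor described in the changelog (poll
on the next halt only if the block that just ended was shorter than the poll
window); the names and the 200000 ns figure stand in for the halt_poll_ns
module parameter and are not the actual patch:

    #include <stdint.h>

    struct vcpu { uint64_t halt_poll_ns; };

    /* stand-in for the halt_poll_ns module parameter */
    static const uint64_t halt_poll_ns_param = 200000;

    /* block_ns = poll_ns + wait_ns, measured for the halt that just ended */
    static void update_block_predictor(struct vcpu *vcpu, uint64_t block_ns)
    {
            if (block_ns < halt_poll_ns_param)
                    /* short halt: poll for the full window next time */
                    vcpu->halt_poll_ns = halt_poll_ns_param;
            else
                    /* long halt: the VCPU looks idle, skip polling next time */
                    vcpu->halt_poll_ns = 0;
    }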

[PATCH 1/2] KVM: make halt_poll_ns per-VCPU

2015-09-01 Thread David Matlack
From: Wanpeng Li 

Change halt_poll_ns into per-VCPU variable, seeded from module parameter,
to allow greater flexibility.

Signed-off-by: Wanpeng Li 
---
 include/linux/kvm_host.h | 1 +
 virt/kvm/kvm_main.c  | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 05e99b8..382cbef 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -241,6 +241,7 @@ struct kvm_vcpu {
int sigset_active;
sigset_t sigset;
struct kvm_vcpu_stat stat;
+   unsigned int halt_poll_ns;
 
 #ifdef CONFIG_HAS_IOMEM
int mmio_needed;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8b8a444..977ffb1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -217,6 +217,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, 
unsigned id)
vcpu->kvm = kvm;
vcpu->vcpu_id = id;
vcpu->pid = NULL;
+   vcpu->halt_poll_ns = 0;
init_waitqueue_head(&vcpu->wq);
kvm_async_pf_vcpu_init(vcpu);
 
@@ -1930,8 +1931,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
bool waited = false;
 
start = cur = ktime_get();
-   if (halt_poll_ns) {
-   ktime_t stop = ktime_add_ns(ktime_get(), halt_poll_ns);
+   if (vcpu->halt_poll_ns) {
+   ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
 
do {
/*
-- 
2.5.0.457.gab17608



[PATCH] KVM: ppc: Fix size of the PSPB register

2015-09-01 Thread Thomas Huth
The size of the Problem State Priority Boost Register is only
32 bits, so let's change the type of the corresponding variable
accordingly to avoid future trouble.

Signed-off-by: Thomas Huth 
---
 arch/powerpc/include/asm/kvm_host.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index d91f65b..c825f3a 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -473,7 +473,7 @@ struct kvm_vcpu_arch {
ulong ciabr;
ulong cfar;
ulong ppr;
-   ulong pspb;
+   u32 pspb;
ulong fscr;
ulong shadow_fscr;
ulong ebbhr;
-- 
1.8.3.1



[PATCH 0/2] Adaptive halt-polling toggle

2015-09-01 Thread David Matlack
This patchset adds a dynamic on/off switch for polling. This patchset
gets good performance on its own for both idle and Message Passing
workloads.

                        no-poll      always-poll    adaptive-toggle
--------------------------------------------------------------------
Idle (nohz) VCPU %c0    0.12         0.32           0.15
Idle (250HZ) VCPU %c0   1.22         6.35           1.27
TCP_RR latency          39 us        25 us          25 us

(3.16 Linux guest, halt_poll_ns=200000)

"Idle (X) VCPU %c0" is the percent of time the physical cpu spent in
c0 over 60 seconds (each VCPU is pinned to a PCPU). (nohz) means the
guest was tickless. (250HZ) means the guest was ticking at 250HZ.

The big win is with ticking operating systems. Running the linux guest
with nohz=off (and HZ=250), we save 5% CPU/second and get close to
no-polling overhead levels by using the adaptive toggle. The savings
should be even higher for higher frequency ticks.

Since we get low idle overhead with polling now, halt_poll_ns defaults
to 200000, instead of 0. We can increase halt_poll_ns a bit more once
we have dynamic halt-polling length adjustments (Wanpeng's patch). We
should however keep halt_poll_ns below 1 ms, since that is the tick
period used by Windows.

David Matlack (1):
  kvm: adaptive halt-polling toggle

Wanpeng Li (1):
  KVM: make halt_poll_ns per-VCPU

 include/linux/kvm_host.h   |   1 +
 include/trace/events/kvm.h |  23 ++
 virt/kvm/kvm_main.c| 111 ++---
 3 files changed, 99 insertions(+), 36 deletions(-)

-- 
2.5.0.457.gab17608



[Bug 100671] vmwrite error in vmx_vcpu_run

2015-09-01 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=100671

Paolo Bonzini  changed:

   What|Removed |Added

 Status|NEW |RESOLVED
 CC||bonz...@gnu.org
 Resolution|--- |CODE_FIX

--- Comment #6 from Paolo Bonzini  ---
The faulty patch, 1cde2930e15473cb4dd7e5a07d83e605a969bd6e, was never in a
stable release.

-- 
You are receiving this mail because:
You are watching the assignee of the bug.


Re: [GIT PULL] Early batch of KVM changes for 4.3 merge window

2015-09-01 Thread Paolo Bonzini


On 01/09/2015 07:45, Xiao Guangrong wrote:
> 
> 
> Actually I triggered this warning on another box of mine and posted a patch
> to fix it, which can be found at:
> http://lkml.iu.edu/hypermail/linux/kernel/1508.2/02771.html
> I guess Paolo is currently busy with KVM Forum, so the patch has not been
> reviewed yet.

Currently I'm busy with the Dolomites, actually.  I'll send a fix
together with the PPC+ARM pull request.

Paolo


Re: [GIT PULL] Early batch of KVM changes for 4.3 merge window

2015-09-01 Thread Paolo Bonzini


On 01/09/2015 02:47, Linus Torvalds wrote:
> Hmm:
> 
> On Fri, Aug 14, 2015 at 4:57 PM, Paolo Bonzini  wrote:
>>
>> Xiao Guangrong (9):
>>   KVM: MMU: fully check zero bits for sptes
> 
> The above commit causes an annoying new compiler warning.
> 
> The warning is bogus ("variable 'leaf' possibly uninitialized"),
> because the use of the variable is protected by the 'bool reserved'
> flag, but gcc is apparently not smart enough to understand that.

Unfortunately it doesn't reproduce on all compiler versions.

Something like this should do it:

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index fb16a8ea3dee..3c745f3abde8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3309,13 +3309,13 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, 
u64 addr, u64 *sptep)
 
walk_shadow_page_lockless_begin(vcpu);
 
-   for (shadow_walk_init(&iterator, vcpu, addr), root = iterator.level;
+   for (shadow_walk_init(&iterator, vcpu, addr),
+leaf = root = iterator.level;
 shadow_walk_okay(&iterator);
 __shadow_walk_next(&iterator, spte)) {
-   leaf = iterator.level;
spte = mmu_spte_get_lockless(iterator.sptep);
 
-   sptes[leaf - 1] = spte;
+   sptes[--leaf] = spte;
 
if (!is_shadow_present_pte(spte))
break;
@@ -3329,7 +3329,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 
addr, u64 *sptep)
if (reserved) {
pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump 
hierarchy:\n",
   __func__, addr);
-   while (root >= leaf) {
+   while (root > leaf) {
pr_err("-- spte 0x%llx level %d.\n",
   sptes[root - 1], root);
root--;


But honestly I haven't even compiled it yet.  Xiao, what do you think?

Paolo

> Since bogus warnings cause people to possibly ignore the *real*
> warnings, this should be fixed. Maybe the code should get rid of that
> 'reserved' flag, and instead initialize "leaf" to zero, and use that
> as the flag instead (since zero isn't a valid level)? That would
> actually avoid an extra variable, and would get rid of the warning.
> 
> Hmm?
> 
>  Linus


Re: [PATCH 1/3] KVM: arm64: Implement vGICv3 distributor and redistributor access from userspace

2015-09-01 Thread Peter Maydell
On 1 September 2015 at 14:52, Andre Przywara  wrote:
> Also the GIC specification says that everything must be accessible with
> 32-bit accesses. Correct me if I am wrong on this, but vCPUs are not
> supposed to run while you are getting/setting VGIC registers, right? So
> there shouldn't be any issues with non-atomic accesses to 64-bit
> registers, which means you could just go ahead and do everything in
> 32-bit only. This would also help with supporting 32-bit userland and/or
> kernel later.

We should design the userspace API based on the natural size
of the registers in the GICv3 spec, not on what happens to
be convenient for the kernel to implement. There's only one
kernel but there can be multiple userspace consumers of the API...

I don't see any reason why a 32-bit userland wouldn't be able
to handle 64-bit accesses via the KVM_SET/GET_DEVICE_ATTR
ioctls, or am I missing something?
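
As a sketch of what such an access looks like from userspace, independent of
the process's pointer size (illustrative only; it assumes an arm64 build where
KVM_DEV_ARM_VGIC_GRP_DIST_REGS is visible, and the .attr encoding of the
register offset is simplified compared to the real API):

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Read one 64-bit distributor register into a buffer the caller owns.
     * The value travels through the .addr pointer of struct kvm_device_attr,
     * so a 32-bit process can use it just as well as a 64-bit one. */
    static int vgic_get_reg64(int gic_dev_fd, uint64_t offset, uint64_t *val)
    {
            struct kvm_device_attr attr = {
                    .group = KVM_DEV_ARM_VGIC_GRP_DIST_REGS,
                    .attr  = offset,                 /* encoding simplified */
                    .addr  = (uint64_t)(uintptr_t)val,
            };

            return ioctl(gic_dev_fd, KVM_GET_DEVICE_ATTR, &attr);
    }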

thanks
-- PMM


RE: [PATCH 1/3] KVM: arm64: Implement vGICv3 distributor and redistributor access from userspace

2015-09-01 Thread Pavel Fedin
 Hello!

> I agree on this, actually I consider this dangerous. Currently the
> memory behind addr in QEMU (hw/intc/arm_gic_kvm.c:kvm_arm_gic_get() for
> instance) is only uint32_t, so you have to take care to provide uint64_t
> backing for those registers, which means that there must be a match
> between the register size the kernel knows and the size userland thinks
> of. So I'd rather see the access size controlled by userland

 Ok, i will implement it this way.

> Also the GIC specification says that everything must be accessible with
> 32-bit accesses. Correct me if I am wrong on this, but vCPUs are not
> supposed to run while you are getting/setting VGIC registers, right?

 Right.

> So there shouldn't be any issues with non-atomic accesses to 64-bit
> registers, which means you could just go ahead and do everything in
> 32-bit only.

 I thought about it too; it's inconvenient. In userland you would have to do
two accesses and merge the result. It's just tedious. After all, this API is
not emulating guest behavior, it's just for reading/writing GIC state.
 So on the next respin I'll add the size bit.

Kind regards,
Pavel Fedin
Expert Engineer
Samsung Electronics Research center Russia



Re: [PATCH 3/3] KVM: arm64: Implement accessors for vGIC CPU interface registers

2015-09-01 Thread Christoffer Dall
On Tue, Sep 01, 2015 at 04:09:18PM +0300, Pavel Fedin wrote:
>  Hello!
> 
> > Have you thought about proper locking/serializing of access to the GIC
> > state in these accessor functions?  
> 
>  I am in the process of rewriting the whole thing, and I came to this point.
>  What kind of locking would you expect? It's a CPU interface; it does not
> affect the state of any other vCPUs. And, since I am getting/setting its
> registers, I assume that the vCPU is not running.
> Well, I added the check. What next?
> 
I think we make some assumptions throughout the vgic code that only the
vcpu itself touches the state of the registers.  Maybe there is no need
for additional locking, but I'd sleep better at night if I knew that
whoever implemented save/restore logic had thought about concurrency.

-Christoffer


Re: commit 3c2e7f7de3 (KVM use NPT page attributes) causes boot failures

2015-09-01 Thread Markus Trippelsdorf
On 2015.09.01 at 21:00 +0800, Xiao Guangrong wrote:
> 
> Did it trigger the BUG()/BUG_ON() in mtrr2protval()/fallback_mtrr_type()?
> If yes, could you please print the actual value out?

It is the BUG() in fallback_mtrr_type(). I changed it to a printk and
it prints 1 for the value of mtrr.

 MTRR_TYPE_WRCOMB 1

-- 
Markus


Re: [PATCH 1/3] KVM: arm64: Implement vGICv3 distributor and redistributor access from userspace

2015-09-01 Thread Andre Przywara
Hi Pavel,

...

>> diff --git a/virt/kvm/arm/vgic-v3-emul.c b/virt/kvm/arm/vgic-v3-emul.c
>> index e661e7f..b3847e1 100644
>> --- a/virt/kvm/arm/vgic-v3-emul.c
>> +++ b/virt/kvm/arm/vgic-v3-emul.c
...
>> @@ -1000,40 +1102,95 @@ static void vgic_v3_destroy(struct kvm_device *dev)
>>  kfree(dev);
>>  }
>>  
>> +static u32 vgic_v3_get_reg_size(struct kvm_device_attr *attr)
>> +{
>> +u32 offset;
>> +
>> +switch (attr->group) {
>> +case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
>> +offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
>> +if (offset >= GICD_IROUTER && offset <= 0x7FD8)
> 
> eh, 0x7FD8 ?
> 
>> +return 8;
>> +else
>> +return 4;
>> +break;
>> +
>> +case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
>> +offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
>> +if ((offset == GICR_TYPER) ||
>> +(offset >= GICR_SETLPIR && offset <= GICR_INVALLR))
>> +return 8;
>> +else
>> +return 4;
>> +break;
>> +
>> +default:
>> +return -ENXIO;
>> +}
>> +}
> 
> this feels wrong.

I agree on this; actually, I consider this dangerous. Currently the
memory behind addr in QEMU (hw/intc/arm_gic_kvm.c:kvm_arm_gic_get() for
instance) is only uint32_t, so you have to take care to provide uint64_t
backing for those registers, which means that there must be a match
between the register size the kernel knows and the size userland thinks
of. So I'd rather see the access size controlled by userland, probably
using Christoffer's suggestion below.

Also the GIC specification says that everything must be accessible with
32-bit accesses. Correct me if I am wrong on this, but vCPUs are not
supposed to run while you are getting/setting VGIC registers, right? So
there shouldn't be any issues with non-atomic accesses to 64-bit
registers, which means you could just go ahead and do everything in
32-bit only. This would also help with supporting 32-bit userland and/or
kernel later.

Cheers,
Andre.


RE: [PATCH 3/3] KVM: arm64: Implement accessors for vGIC CPU interface registers

2015-09-01 Thread Pavel Fedin
 Hello!

> Have you thought about proper locking/serializing of access to the GIC
> state in these accessor functions?  

 I am in the process of rewriting the whole thing, and I came to this point.
 What kind of locking would you expect? It's a CPU interface; it does not
affect the state of any other vCPUs. And, since I am getting/setting its
registers, I assume that the vCPU is not running.
Well, I added the check. What next?

Kind regards,
Pavel Fedin
Expert Engineer
Samsung Electronics Research center Russia




Re: commit 3c2e7f7de3 (KVM use NPT page attributes) causes boot failures

2015-09-01 Thread Xiao Guangrong



On 09/01/2015 06:04 PM, Markus Trippelsdorf wrote:

> On 2015.09.01 at 10:56 +0200, Ingo Molnar wrote:
> > 
> > * Markus Trippelsdorf  wrote:
> > > As I wrote in my other reply. The boot failure is nondeterministic (boot
> > > succeeds roughly every sixth time). So the bisection and the patch is
> > > just bogus (,but the boot failure is real).
> > > 
> > > Sorry.
> > 
> > No problem. Please let us know if any of these commits does turn out to be the
> > culprit. (Which is always a possibility.)
> 
> I'm pretty sure commit 3c2e7f7de3 is the culprit.
> 
> commit 3c2e7f7de3240216042b61073803b61b9b3cfb22
> Author: Paolo Bonzini 
> Date:   Tue Jul 7 14:32:17 2015 +0200
> 
>  KVM: SVM: use NPT page attributes
> 
> I've booted ten times in a row successfully with the following patch:
> 
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 74d825716f4f..3190173a575f 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -989,7 +989,7 @@ static __init int svm_hardware_setup(void)
> 	} else
> 		kvm_disable_tdp();
> 
> -   build_mtrr2protval();
> +// build_mtrr2protval();
> 	return 0;
> 
>   err:
> 
> Paolo, your commit causes nondeterministic boot failure on my machine.
> It sometimes crashes early with the following backtrace:

Did it trigger the BUG()/BUG_ON() in mtrr2protval()/fallback_mtrr_type()?
If yes, could you please print the actual value out?

BTW, you may change BUG() to WARN() to get the print info more easier.



[no subject]

2015-09-01 Thread Wei Xu
subscribe kvm


Re: commit 3c2e7f7de3 (KVM use NPT page attributes) causes boot failures

2015-09-01 Thread Markus Trippelsdorf
On 2015.09.01 at 10:56 +0200, Ingo Molnar wrote:
> 
> * Markus Trippelsdorf  wrote:
> > As I wrote in my other reply. The boot failure is nondeterministic (boot
> > succeeds roughly every sixth time). So the bisection and the patch is
> > just bogus (,but the boot failure is real).
> > 
> > Sorry.
> 
> No problem. Please let us know if any of these commits does turn out to be 
> the 
> culprit. (Which is always a possibility.)

I'm pretty sure commit 3c2e7f7de3 is the culprit.

commit 3c2e7f7de3240216042b61073803b61b9b3cfb22
Author: Paolo Bonzini 
Date:   Tue Jul 7 14:32:17 2015 +0200

KVM: SVM: use NPT page attributes

I've booted ten times in a row successfully with the following patch:

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 74d825716f4f..3190173a575f 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -989,7 +989,7 @@ static __init int svm_hardware_setup(void)
} else
kvm_disable_tdp();
 
-   build_mtrr2protval();
+// build_mtrr2protval();
return 0;
 
 err:

Paolo, your commit causes nondeterministic boot failure on my machine.
It sometimes crashes early with the following backtrace:

map_vsyscall
kvm_arch_hardware_setup
map_vsyscall
kvm_init
map_vsyscall
do_one_initcall
kernel_init_freeable
rest_init
kernel_init
ret_from_fork
rest_init

RIP: svm_hardware_setup 

-- 
Markus


[Bug 103851] qemu windows guest hangs on 100% cpu usage

2015-09-01 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=103851

Wanpeng Li  changed:

   What|Removed |Added

 CC||wanpeng...@hotmail.com

--- Comment #1 from Wanpeng Li  ---
If this can be reproduced against latest kvm tree?

-- 
You are receiving this mail because:
You are watching the assignee of the bug.


Re: [Qemu-devel] [PATCH v2 14/18] nvdimm: support NFIT_CMD_IMPLEMENTED function

2015-09-01 Thread Stefan Hajnoczi
On Mon, Aug 31, 2015 at 02:51:50PM +0800, Xiao Guangrong wrote:
> 
> 
> On 08/28/2015 08:01 PM, Stefan Hajnoczi wrote:
> >On Wed, Aug 26, 2015 at 06:46:35PM +0800, Xiao Guangrong wrote:
> >>On 08/26/2015 12:23 AM, Stefan Hajnoczi wrote:
> >>>On Fri, Aug 14, 2015 at 10:52:07PM +0800, Xiao Guangrong wrote:
>   static void dsm_write(void *opaque, hwaddr addr,
> uint64_t val, unsigned size)
>   {
> +struct MemoryRegion *dsm_ram_mr = opaque;
> +struct dsm_buffer *dsm;
> +struct dsm_out *out;
> +void *buf;
> +
>   assert(val == NOTIFY_VALUE);
> >>>
> >>>The guest should not be able to cause an abort(3).  If val !=
> >>>NOTIFY_VALUE we can do nvdebug() and then return.
> >>
> >>The ACPI code and emulation code both are from qemu, if that happens,
> >>it's really a bug, aborting the VM is better than throwing a debug
> >>message under this case to avoid potential data corruption.
> >
> >abort(3) is dangerous because it can create a core dump.  If a malicious
> >guest triggers this repeatedly it could consume a lot of disk space and
> >I/O or CPU while performing the core dumps.
> >
> >We cannot trust anything inside the guest, even if the guest code comes
> >from QEMU because a malicious guest can still read/write to the same
> >hardware registers.
> >
> 
> Completely agree with you. :)
> 
> How about using exit(1) instead of abort() to kill the VM?

Most devices on a physical machine do not power off or reset the machine
in case of error.

I think it's good to follow that model and avoid killing the VM.
Otherwise nested virtualization or userspace drivers can take down the
whole VM.
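
A sketch of that model, purely illustrative and not the proposed nvdimm code:
log the guest-triggerable condition with QEMU's LOG_GUEST_ERROR mask and keep
going, rather than calling abort()/exit() which a malicious guest could trigger
at will (NOTIFY_VALUE is the constant from the patch under discussion):

    #include "qemu/osdep.h"
    #include "qemu/log.h"
    #include "exec/hwaddr.h"

    static void dsm_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
    {
        if (val != NOTIFY_VALUE) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "nvdimm: unexpected DSM notify value 0x%" PRIx64 "\n",
                          val);
            return;     /* ignore the bogus write, keep the VM running */
        }
        /* ... normal DSM buffer handling ... */
    }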

Stefan


Re: [PATCH v2 08/18] nvdimm: init backend memory mapping and config data area

2015-09-01 Thread Stefan Hajnoczi
On Mon, Aug 31, 2015 at 02:23:43PM +0800, Xiao Guangrong wrote:
> 
> Hi Stefan,
> 
> On 08/28/2015 07:58 PM, Stefan Hajnoczi wrote:
> 
> >
> +goto do_unmap;
> +}
> +
> +nvdimm->device_index = new_device_index();
> +sprintf(name, "NVDIMM-%d", nvdimm->device_index);
> +memory_region_init_ram_ptr(&nvdimm->mr, OBJECT(dev), name, 
> nvdimm_size,
> +   buf);
> >>>
> >>>How is the autogenerated name used?
> >>>
> >>>Why not just use "pc-nvdimm.memory"?
> >>
> >>Ah. Just for debug proposal :) and i am not sure if a name used for multiple
> >>MRs (MemoryRegion) is a good idea.
> >
> >Other devices use a constant name too (git grep
> >memory_region_init_ram_ptr) so it seems to be okay.  The unique thing is
> >the OBJECT(dev) which differs for each NVDIMM instance.
> >
> 
> When I was digging into the live migration code, I noticed that the same MR
> name may cause a duplicate "idstr"; please refer to qemu_ram_set_idstr().
> 
> Since nvdimm devices do not have a parent bus, it will trigger the abort() in
> that function.

I see.  The other devices that use a constant name are on a bus so the
abort doesn't trigger.


Re: [PATCH V3 2/3] kvm: don't register wildcard MMIO EVENTFD on two buses

2015-09-01 Thread Michael S. Tsirkin
On Tue, Sep 01, 2015 at 04:22:22PM +0800, Jason Wang wrote:
> 
> 
> On 09/01/2015 02:54 PM, Michael S. Tsirkin wrote:
> > On Tue, Sep 01, 2015 at 12:47:36PM +0800, Jason Wang wrote:
> >>
> >> On 09/01/2015 12:31 PM, Michael S. Tsirkin wrote:
> >>> On Tue, Sep 01, 2015 at 11:33:43AM +0800, Jason Wang wrote:
>  On 08/31/2015 07:33 PM, Michael S. Tsirkin wrote:
> > On Mon, Aug 31, 2015 at 04:03:59PM +0800, Jason Wang wrote:
> >>> On 08/31/2015 03:29 PM, Michael S. Tsirkin wrote:
> > Thinking more about this, invoking the 0-length write after
> > the != 0 length one would be better: it would mean 
> > we only
> > handle the userspace MMIO like this.
> > Right.
> >
> > Using current unittest. This patch is about 2.9% slower than 
> > before, and
> > invoking 0-length write after is still 1.1% slower 
> > (mmio-datamatch-eventfd).
> >
> > /patch/result/-+%/
> > /base/2957/0/
> > /V3/3043/+2.9%/
> > /V3+invoking != 0 length first/2990/+1.1%/
> >
> > So looks like the best method is not searching 
> > KVM_FAST_MMIO_BUS during
> > KVM_MMIO_BUS. Instead, let userspace to register both datamatch 
> > and
> > wildcard in this case. Does this sound good to you?
> > No - we can't change userspace.
> >>> Actually, the change was as simple as following. So I don't get the
> >>> reason why.
> > Because it's too late - we committed to a specific userspace ABI
> > when this was merged in kernel, we must maintain it.
>  Ok ( Though I don't think it has real users for this now because it was
>  actually broken).
> >>> It actually worked most of the time - you only trigger a use after free
> >>> on deregister.
> >>>
> >> It doesn't work for amd and intel machine without ept.
> > I thought it does :(
> >
> > Even if I thought yours is a good API (and I don't BTW - it's exposing
> > internal implementation details) it's too late to change it.
>  I believe we should document the special treatment in kernel of zero
>  length mmio eventfd in api.txt? If yes, is this an exposing? If not, how
>  can userspace know the advantages of this and use it? For better API,
>  probably we need another new flag just for fast mmio and obsolete
>  current one by failing the assigning for zero length mmio eventfd.
> >>> I sent a patch to update api.txt already as part of
> >>> kvm: add KVM_CAP_IOEVENTFD_PF capability.
> >>> I should probably split it out.
> >>>
> >>> Sorry, I don't think the api change you propose makes sense - just fix the
> >>> crash in the existing one.
> >>>
> >> Ok, so I believe the fix should go:
> >>
> >> - having two ioeventfds when we want to assign zero length mmio eventfd
> > You mean the in-kernel data structures?
> 
> Yes.
> 
> >
> >> - change the kvm_io_bus_sort_cmp() and can handle zero length correctly
> > This one's for amd/non ept, right? I'd rather we implemented the
> > fast mmio optimization for these.
> 
> Agree, but we'd better fix it and backport it to stable first?

I would say fix it upstream first. Worry about stable later.  And I
don't see a lot of value in adding a temporary hack - it's not too much
work to just do the optimal thing directly.

But I won't nack a temporary solution if you insist.
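
For reference, this is roughly what the "zero length mmio eventfd" under
discussion looks like from userspace; a sketch only, with vm_fd and the
doorbell address as placeholders. With len == 0 the eventfd matches on address
alone and the written data is not checked, which is what the fast-mmio path
keys on:

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <sys/eventfd.h>
    #include <linux/kvm.h>

    /* Register an eventfd that fires on any write to 'gpa', ignoring the data. */
    static int register_fast_mmio_eventfd(int vm_fd, uint64_t gpa)
    {
            int efd = eventfd(0, EFD_CLOEXEC);
            struct kvm_ioeventfd p = {
                    .addr = gpa,   /* guest-physical doorbell address */
                    .len  = 0,     /* zero length: wildcard, data not checked */
                    .fd   = efd,
            };

            if (efd < 0)
                    return -1;
            return ioctl(vm_fd, KVM_IOEVENTFD, &p);
    }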

-- 
MST


Re: [PATCH V3 2/3] kvm: don't register wildcard MMIO EVENTFD on two buses

2015-09-01 Thread Jason Wang


On 09/01/2015 02:54 PM, Michael S. Tsirkin wrote:
> On Tue, Sep 01, 2015 at 12:47:36PM +0800, Jason Wang wrote:
>>
>> On 09/01/2015 12:31 PM, Michael S. Tsirkin wrote:
>>> On Tue, Sep 01, 2015 at 11:33:43AM +0800, Jason Wang wrote:
 On 08/31/2015 07:33 PM, Michael S. Tsirkin wrote:
> On Mon, Aug 31, 2015 at 04:03:59PM +0800, Jason Wang wrote:
>>> On 08/31/2015 03:29 PM, Michael S. Tsirkin wrote:
> Thinking more about this, invoking the 0-length write after
> the != 0 length one would be better: it would mean we 
> only
> handle the userspace MMIO like this.
> Right.
>
> Using current unittest. This patch is about 2.9% slower than 
> before, and
> invoking 0-length write after is still 1.1% slower 
> (mmio-datamatch-eventfd).
>
> /patch/result/-+%/
> /base/2957/0/
> /V3/3043/+2.9%/
> /V3+invoking != 0 length first/2990/+1.1%/
>
> So looks like the best method is not searching KVM_FAST_MMIO_BUS 
> during
> KVM_MMIO_BUS. Instead, let userspace to register both datamatch 
> and
> wildcard in this case. Does this sound good to you?
> No - we can't change userspace.
>>> Actually, the change was as simple as following. So I don't get the
>>> reason why.
> Because it's too late - we committed to a specific userspace ABI
> when this was merged in kernel, we must maintain it.
 Ok ( Though I don't think it has real users for this now because it was
 actually broken).
>>> It actually worked most of the time - you only trigger a use after free
>>> on deregister.
>>>
>> It doesn't work for amd and intel machine without ept.
> I thought it does :(
>
> Even if I thought yours is a good API (and I don't BTW - it's exposing
> internal implementation details) it's too late to change it.
 I believe we should document the special treatment in kernel of zero
 length mmio eventfd in api.txt? If yes, is this an exposing? If not, how
 can userspace know the advantages of this and use it? For better API,
 probably we need another new flag just for fast mmio and obsolete
 current one by failing the assigning for zero length mmio eventfd.
>>> I sent a patch to update api.txt already as part of
>>> kvm: add KVM_CAP_IOEVENTFD_PF capability.
>>> I should probably split it out.
>>>
>>> Sorry, I don't think the api change you propose makes sense - just fix the
>>> crash in the existing one.
>>>
>> Ok, so I believe the fix should go:
>>
>> - having two ioeventfds when we want to assign zero length mmio eventfd
> You mean the in-kernel data structures?

Yes.

>
>> - change the kvm_io_bus_sort_cmp() and can handle zero length correctly
> This one's for amd/non ept, right? I'd rather we implemented the
> fast mmio optimization for these.

Agree, but we'd better fix it and backport it to stable first?



[Bug 103851] New: qemu windows guest hangs on 100% cpu usage

2015-09-01 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=103851

Bug ID: 103851
   Summary: qemu windows guest hangs on 100% cpu usage
   Product: Virtualization
   Version: unspecified
Kernel Version: 3.13.6
  Hardware: Intel
OS: Linux
  Tree: Mainline
Status: NEW
  Severity: blocking
  Priority: P1
 Component: kvm
  Assignee: virtualization_...@kernel-bugs.osdl.org
  Reporter: biaoxian...@163.com
Regression: No

hi:
I have two VMs: one is WinXP Professional SP3 32bit, the other is
WindowsServer2008 Enterprise SP2 64bit.
When I hot reboot WinXP in the guest OS, it hangs on the progress bar, and all
the vcpu threads in qemu are at 100% usage.
There are no messages in the kernel log when it happens. I tried to rebuild kvm
and add some debug info, and I found the cpu exit reason is
EXIT_REASON_PAUSE_INSTRUCTION.
It seems like all the vcpus are always waiting on a spinlock. I'm not sure
whether it's qemu's bug or kvm's.
Any help would be appreciated.

How reproducible:
WinXP: seems always.
WinServer2008: rare.

Steps to Reproduce:
WinXP: hot reboot the XP guest OS; hot reboot is necessary.
WinServer2008: not sure, I didn't do anything, it just happened.

The difference between WinXP and WinServer2008:
1. When WinXP hangs, the boot progress bar keeps rolling, so I think vnc is
working fine.
2. When WinServer2008 hangs, vnc shows the last screen and nothing on the
screen changes, including the system time.
3. When the VM hangs, if I execute "virsh suspend vm-name" and "virsh resume
vm-name", WinServer2008 returns to normal and works fine, not hanging anymore.
But WinXP does not change anything and still hangs.

qemu version:
QEMU emulator version 1.5.0, Copyright (c) 2003-2008 Fabrice Bellard

host info:
Intel(R) Xeon(R) CPU E5-2620 0 @ 2.00GHz
Ubuntu 12.04 LTS \n \l
Linux cvknode2026 3.13.6 #1 SMP Fri Dec 12 09:17:35 CST 2014 x86_64 x86_64
x86_64 GNU/Linux

 qemu command line (guest OS XP):
root 7124 1178 7.6 7750360 3761644 ? Sl 14:02 435:23 /usr/bin/kvm -name x -S
-machine pc-i440fx-1.5,accel=kvm,usb=off,system=windows -cpu
qemu64,hv_relaxed,hv_spinlocks=0x2000 -m 6144 -smp
12,maxcpus=72,sockets=12,cores=6,threads=1 -uuid
d3832129-f77d-4b21-bbf7-fd337f53e572 -no-user-config -nodefaults -chardev
socket,id=charmonitor,path=/var/lib/libvirt/qemu/x.monitor,server,nowait -mon
chardev=charmonitor,id=monitor,mode=control -rtc
base=localtime,clock=vm,driftfix=slew -no-hpet -no-shutdown -device
piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 -device
usb-ehci,id=ehci,bus=pci.0,addr=0x4 -device
virtio-serial-pci,id=virtio-serial0,bus=pci.0,addr=0x5 -drive
file=/vms/images/sn1-of-ff.qcow2,if=none,id=drive-ide0-0-0,format=qcow2,cache=directsync
-device ide-hd,bus=ide.0,unit=0,drive=drive-ide0-0-0,id=ide0-0-0,bootindex=1
-drive if=none,id=drive-ide0-1-1,readonly=on,format=raw -device
ide-cd,bus=ide.1,unit=1,drive=drive-ide0-1-1,id=ide0-1-1,bootindex=2 -netdev
tap,fd=24,id=hostnet0 -device
rtl8139,netdev=hostnet0,id=net0,mac=0c:da:41:1d:f8:40,bus=pci.0,addr=0x3
-chardev pty,id=charserial0 -device isa-serial,chardev=charserial0,id=serial0
-chardev
socket,id=charchannel0,path=/var/lib/libvirt/qemu/x.agent,server,nowait -device
virtserialport,bus=virtio-serial0.0,nr=1,chardev=charchannel0,id=channel0,name=org.qemu.guest_agent.0
-device usb-tablet,id=input0,bus=usb.0 -vnc 0.0.0.0:0 -device
VGA,id=video0,bus=pci.0,addr=0x2 -device
virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x6

 all qemu thread (guest OS XP):
root@cvknode2026:/proc/7124/task# top -d 1 -H -p 7124
top - 14:37:05 up 7 days, 4:07, 1 user, load average: 10.71, 10.90, 10.19
Tasks: 14 total, 12 running, 2 sleeping, 0 stopped, 0 zombie
Cpu(s): 38.8%us, 11.2%sy, 0.0%ni, 50.0%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%st
Mem: 49159888k total, 35665128k used, 13494760k free, 436312k buffers
Swap: 8803324k total, 0k used, 8803324k free, 28595100k cached

  PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ P SWAP WCHAN COMMAND
 7130 root 20 0 7568m 3.6g 6628 R 101 7.7 33:43.48 3 3.8g - kvm
 7132 root 20 0 7568m 3.6g 6628 R 101 7.7 33:43.13 1 3.8g - kvm
 7133 root 20 0 7568m 3.6g 6628 R 101 7.7 33:42.70 6 3.8g - kvm
 7135 root 20 0 7568m 3.6g 6628 R 101 7.7 33:42.33 11 3.8g - kvm
 7137 root 20 0 7568m 3.6g 6628 R 101 7.7 33:42.59 17 3.8g - kvm
 7126 root 20 0 7568m 3.6g 6628 R 100 7.7 34:06.76 4 3.8g - kvm
 7127 root 20 0 7568m 3.6g 6628 R 100 7.7 33:44.14 8 3.8g - kvm
 7128 root 20 0 7568m 3.6g 6628 R 100 7.7 33:43.64 13 3.8g - kvm
 7129 root 20 0 7568m 3.6g 6628 R 100 7.7 33:43.64 7 3.8g - kvm
 7131 root 20 0 7568m 3.6g 6628 R 100 7.7 33:44.24 10 3.8g - kvm
 7134 root 20 0 7568m 3.6g 6628 R 100 7.7 33:42.47 12 3.8g - kvm
 7136 root 20 0 7568m 3.6g 6628 R 100 7.7 33:42.16 2 3.8g - kvm
 7124 root 20 0 7568m 3.6g 6628 S 1 7.7 0:30.65 14 3.8g poll_sche kvm
 7139 root 20 0 7568m 3.6g 6628 S 0 7.7 0:01.71 14 3.8g futex_wai kvm

all threads' kernel stacks (guest OS XP):
root@cvknode2026:/proc/7124/task# ca