[RFC 1/2] Simulate Intel cpufreq MSRs in kvm guests to influence nice priority

Darrick J. Wong Tue, 15 Jul 2008 16:18:38 -0700

Hi all,

This patch set attempts to distinguish which guests are generating load
on the host because of interrupts, overhead, etc, against the guests
that are generating host load because they're truly doing something.
This is done by presenting cpufreq tables to kvm guests.  A guest that
is truly busy will select the highest CPU frequency, whereas a mostly
idle guest will select the lowest speed; based on this, we can change
the nice level of the guest CPU thread.


I envision four scenarios:

0. Guests that don't know about cpufreq still run at whatever nice level
they started with.

1. If we have a system with a lot of idle VMs, they will all run with +5
nice and this patch has no effect.

2. If we have a system with a lot of busy VMs, they all run with -5 nice
and this patch also has no effect.

3. If, however, we have a lot of idle VMs and a few busy ones, then the
-5 nice of the busy VMs will get those VMs extra CPU time.  On a really
crummy FPU microbenchmark I have, the score goes from about 500 to 2000
with the patch applied, though of course YMMV.  In some respects this
implementation shares a few ideas with the current Intel Dynamic
Acceleration implementation--you ask it for a speed that is higher than
what's written on the box, and if everything else is idle you actually
get the higher speed.  Otherwise you get what's written on the box.  But
you can't really know for sure.

There are some warts to this patch--most notably, the current
implementation uses the Intel MSRs and EST feature flag ... even if the
guest reports the CPU as being AuthenticAMD.  Also, there could be
timing problems introduced by this change--the OS thinks the CPU
frequency changes, but I don't know the effect on the guest CPU TSCs.

Questions?  Comments?  Please don't apply this to mainline.
---
This patch implements the Intel cpufreq control MSR.  Writes to the MSR are
used to bump up the nice level of the guest CPU thread if the OS picks a
sufficiently high p-state.

Control values are as follows:
0: Nobody's touched cpufreq.  nice is whatever the default is.
1: Lowest speed.  nice +5.
2: Medium speed.  nice is reset.
3: High speed.  nice -5.

The actual nice value is set via differential, so if a VM is started with a
nondefault nice priority it will fluctuate up and down from the initial value.

(This requires ACPI support from kvm-userspace, etc.)

Applies against vanilla 2.6.26.

Signed-off-by: Darrick J. Wong <[EMAIL PROTECTED]>
---

 arch/x86/kvm/x86.c         |   51 +++++++++++++++++++++++++++++++++++++++++---
 include/asm-x86/kvm_host.h |    1 +
 2 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 63a77ca..233ded2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -27,6 +27,7 @@
 #include <linux/module.h>
 #include <linux/mman.h>
 #include <linux/highmem.h>
+#include <linux/security.h>
 
 #include <asm/uaccess.h>
 #include <asm/msr.h>
@@ -431,7 +432,7 @@ static u32 msrs_to_save[] = {
        MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
        MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
-       MSR_IA32_PERF_STATUS,
+       MSR_IA32_PERF_STATUS, MSR_IA32_PERF_CTL,
 };
 
 static unsigned num_msrs_to_save;
@@ -604,6 +605,44 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
        mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
 }
 
+static int perf_ctl_to_nice(int pctl) /* map a PERF_CTL control value to a nice level biased by +20; the caller subtracts 20 */
+{
+       switch (pctl) {
+       case 3: /* most favorable: 10 - 20 = -10 once the caller removes the bias */
+               return 10;
+       case 2: /* neutral: 20 - 20 = 0 */
+               return 20;
+       case 1: /* least favorable: 30 - 20 = +10 */
+               return 30;
+       default: /* any other value (including 0) is not a recognized control value */
+               return -EINVAL;
+       }
+}
+
+static void write_perf_ctl(struct kvm_vcpu *vcpu, u64 pctl) /* called for a write to MSR_IA32_PERF_CTL: re-nice `current` by the change in boost */
+{
+       int new_nice;
+       int old_nice_boost = perf_ctl_to_nice(vcpu->arch.ia32_perf_ctl); /* boost implied by the last accepted write */
+       int new_nice_boost = perf_ctl_to_nice(pctl); /* NOTE(review): u64 pctl is narrowed to int by this call */
+
+       if (old_nice_boost < 0) /* stored value is not 1..3, so treat the previous boost as zero */
+               old_nice_boost = 0;
+       else
+               old_nice_boost -= 20; /* remove the +20 bias: -10, 0 or +10 */
+
+       if (new_nice_boost < 0) /* unrecognized value: ignore the write and keep the stored value */
+               return;
+       new_nice_boost -= 20; /* remove the +20 bias: -10, 0 or +10 */
+
+       new_nice = (new_nice_boost - old_nice_boost) + task_nice(current); /* apply only the difference between new and old boost */
+       if (new_nice < -20) /* clamp the result to [-20, 19] */
+               new_nice = -20;
+       if (new_nice > 19)
+               new_nice = 19;
+
+       set_user_nice(current, new_nice);
+       vcpu->arch.ia32_perf_ctl = pctl; /* remembered so the next write can subtract this boost */
+}
 
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
@@ -633,6 +672,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 
data)
        case MSR_IA32_MISC_ENABLE:
                vcpu->arch.ia32_misc_enable_msr = data;
                break;
+       case MSR_IA32_PERF_CTL:
+               write_perf_ctl(vcpu, data);
+               break;
        case MSR_KVM_WALL_CLOCK:
                vcpu->kvm->arch.wall_clock = data;
                kvm_write_wall_clock(vcpu->kvm, data);
@@ -717,13 +759,15 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, 
u64 *pdata)
                data = kvm_get_apic_base(vcpu);
                break;
        case MSR_IA32_MISC_ENABLE:
-               data = vcpu->arch.ia32_misc_enable_msr;
+               data = vcpu->arch.ia32_misc_enable_msr | 0x10000;
                break;
        case MSR_IA32_PERF_STATUS:
                /* TSC increment by tick */
                data = 1000ULL;
                /* CPU multiplier */
                data |= (((uint64_t)4ULL) << 40);
+       case MSR_IA32_PERF_CTL:
+               data = vcpu->arch.ia32_perf_ctl;
                break;
        case MSR_EFER:
                data = vcpu->arch.shadow_efer;
@@ -1113,7 +1157,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, 
u32 function,
                bit(X86_FEATURE_3DNOWEXT) |
                bit(X86_FEATURE_3DNOW);
        const u32 kvm_supported_word3_x86_features =
-               bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
+               bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16) |
+               bit(X86_FEATURE_EST);
        const u32 kvm_supported_word6_x86_features =
                bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);
 
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 844f2a8..0bfa7bb 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -231,6 +231,7 @@ struct kvm_vcpu_arch {
        int mp_state;
        int sipi_vector;
        u64 ia32_misc_enable_msr;
+       u64 ia32_perf_ctl;
        bool tpr_access_reporting;
 
        struct kvm_mmu mmu;
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[RFC 1/2] Simulate Intel cpufreq MSRs in kvm guests to influence nice priority

Reply via email to