Hi Avi,

On Sun, Jan 13, 2008 at 02:19:29PM +0200, Avi Kivity wrote:
> Marcelo Tosatti wrote:
> >The boot TSC sync check is failing on recent Linux SMP guests on TSC
> >stable hosts.
> >
> >  
> 
> What about tsc unstable hosts?  If your patch convinces the guest its 
> tsc is stable, while the host tsc is not, then it may cause confusion 
> later on.

The adjustment to zero won't fool the guest, because the patch assumes
the host TSCs are synchronized. It simply sets the guest TSC to zero on
all VCPUs, relative to the time VCPU0 was initialized.

That is, setting -(vcpu[0].first_tsc) on all VCPUs will not correct any
synchronization problem.
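
Spelling out the arithmetic (illustration only; first_tsc0 and skew are
names I'm using here, not identifiers from the patch):

        /*
         * Every VCPU gets the same fixed offset, taken from VCPU0's
         * init time:
         *
         *      TSC_OFFSET = 0 - first_tsc0
         *      guest_tsc  = host_tsc + TSC_OFFSET = host_tsc - first_tsc0
         *
         * If the TSCs of two physical CPUs disagree by 'skew' cycles:
         *
         *      guest_tsc on VCPU0 = host_tsc - first_tsc0
         *      guest_tsc on VCPU1 = (host_tsc + skew) - first_tsc0
         *
         * The difference is still 'skew': host desync passes straight
         * through to the guest, neither hidden nor corrected.
         */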

> >Following patch attempts to address the problems, which are:
> >
> >1) APIC_DM_STARTUP, which is only delivered to vcpus other than vcpu0,
> >will trigger ->vcpu_reset(), setting the TSC to 0. Fix that by moving
> >the guest_write_tsc(0) to vcpu_load.
> >
> >2) vcpus are initialized at different times by QEMU (vcpu0 init happens
> >way earlier than the rest). Fix that by initializing the TSC of vcpus >
> >0 with reference to vcpu0's init TSC value. This way TSC synchronization
> >is kept (apparently Xen does something similar).
> >
> >3) The code which adjusts the TSC of a VCPU on physical CPU switch is
> >meant to guarantee that the guest sees a monotonically increasing value.
> >However there is a large gap, in terms of clocks, between the time at
> >which the source CPU TSC is read and the time at which the destination
> >CPU TSC is read.
> >So move those two reads to happen at vcpu_clear.
> >
> >I believe that 3) is the reason for the -no-kvm-irqchip problems
> >reported by Avi on RHEL5, with kernels < 2.6.21 (where vcpu->cpu pinning
> >would fix the problem). Unfortunately I could not reproduce that problem.
> >
> >A 4-way guest with a constant tick at 250Hz now reliably sees its TSCs
> >synchronized, and idle guest CPU consumption is reduced by 50% (3-4%
> >instead of 8%, the latter with the acpi_pm_good boot parameter).
> >
> >
> >diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> >index 4741806..e1c8cf4 100644
> >--- a/arch/x86/kvm/vmx.c
> >+++ b/arch/x86/kvm/vmx.c
> >@@ -47,6 +47,8 @@ struct vcpu_vmx {
> >     struct kvm_vcpu       vcpu;
> >     int                   launched;
> >     u8                    fail;
> >+    u64                   first_tsc;
> >+    u64                   tsc_this;
> >     u32                   idt_vectoring_info;
> >     struct kvm_msr_entry *guest_msrs;
> >     struct kvm_msr_entry *host_msrs;
> >@@ -254,6 +256,7 @@ static void vcpu_clear(struct vcpu_vmx *vmx)
> >     if (vmx->vcpu.cpu == -1)
> >             return;
> >     smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1);
> >+    rdtscll(vmx->tsc_this);
> >     vmx->launched = 0;
> > }
> > 
> >@@ -480,6 +483,8 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
> >     reload_host_efer(vmx);
> > }
> > 
> >+static void guest_write_tsc(u64 host_tsc, u64 guest_tsc);
> >+
> > /*
> >  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
> >  * vcpu mutex is already taken.
> >@@ -488,7 +493,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> > {
> >     struct vcpu_vmx *vmx = to_vmx(vcpu);
> >     u64 phys_addr = __pa(vmx->vmcs);
> >-    u64 tsc_this, delta;
> >+    u64 delta;
> > 
> >     if (vcpu->cpu != cpu) {
> >             vcpu_clear(vmx);
> >@@ -511,6 +516,19 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> >             struct descriptor_table dt;
> >             unsigned long sysenter_esp;
> > 
> >+            if (unlikely(vcpu->cpu == -1)) {
> >  
> 
> This can happen after migration, I believe.

Right now the destination host will do ->vcpu_reset() on all VCPUs on
startup... so it's already broken. This patch does not introduce any new
issue, it just changes where the reset happens :)

Perhaps fixing migration should be the subject of a separate patch?

> >+                    rdtscll(vcpu->arch.host_tsc);
> >+                    vmx->tsc_this = vcpu->arch.host_tsc;
> >+                    if (vcpu->vcpu_id == 0) {
> >+                            guest_write_tsc(vcpu->arch.host_tsc, 0);
> >+                            vmx->first_tsc = vcpu->arch.host_tsc;
> >+                    } else {
> >+                            struct vcpu_vmx *cpu0;
> >+                            cpu0 = to_vmx(vcpu->kvm->vcpus[0]);
> >+                            guest_write_tsc(cpu0->first_tsc, 0);
> >+                    }
> >+            }
> >+
> >  
> 
> Relying on vcpu_id == 0 can cause problems (for example, if vcpu 0 is 
> somehow not the first to run).

But QEMU will always run kvm_create() first (which does
kvm_create_vcpu(0)), then start the other threads later. So I don't see
how that can happen.

> We might initialize the tsc base on vm creation, and if the host tsc is 
> synced, then the guest tsc should also be.
> 
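For reference, that would look something like this (a sketch only;
vm_init_tsc is a hypothetical new field, and the hook placement is an
assumption on my part):

        /* At VM creation time, record the TSC base once: */
        rdtscll(kvm->arch.vm_init_tsc);         /* hypothetical new field */

        /* Then in vmx_vcpu_load(), identically for every VCPU: */
        if (unlikely(vcpu->cpu == -1)) {
                rdtscll(vcpu->arch.host_tsc);
                guest_write_tsc(vcpu->kvm->arch.vm_init_tsc, 0);
        }

That would remove both the vcpu_id == 0 special case and the
vcpu->kvm->vcpus[0] dereference.
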
> >             vcpu->cpu = cpu;
> >             /*
> >              * Linux uses per-cpu TSS and GDT, so set these when switching
> >@@ -526,8 +544,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> >             /*
> >              * Make sure the time stamp counter is monotonous.
> >              */
> >-            rdtscll(tsc_this);
> >-            delta = vcpu->arch.host_tsc - tsc_this;
> >+            delta = vcpu->arch.host_tsc - vmx->tsc_this;
> >             vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
> >  
> 
> This is a little roundabout, how about moving the delta calculation 
> immediately after the call to vcpu_clear()?
> 
> I don't think this is the cause of the problem, it can't account for 
> more than a few hundred cycles, compared to the much greater vmentry cost.

Actually it accounts for a warp of several thousand cycles by the time
the kernel checks TSC synchronization between CPUs, which happens very
early in boot.

You saw the hang at userspace init; by then there could have been many
VCPU->CPU switches.
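
To make the window concrete (the cycle accounting below is my
illustration of the old flow, with 't0' and 'gap' invented for the
example):

        /*
         * Old flow, one VCPU->CPU switch:
         *
         *   __vcpu_clear, on the source CPU:
         *       rdtscll(vcpu->arch.host_tsc);    reads t0
         *   ... scheduling, IPIs, cache misses: thousands of cycles ...
         *   vmx_vcpu_load, on the destination CPU:
         *       rdtscll(tsc_this);               reads t0 + gap
         *
         *   delta       = host_tsc - tsc_this    == -gap
         *   TSC_OFFSET += delta
         *
         * Each switch subtracts a different 'gap' from that VCPU's view
         * of the TSC, so the VCPUs' TSCs drift apart by thousands of
         * cycles per switch: enough to fail the boot-time sync check.
         */

Doing both reads back to back in vcpu_clear shrinks that window to
almost nothing.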

> 
> Anyway it should be in a separate patch.

How does the following look (minus a proper changelog)?


Index: kvm.quilt/arch/x86/kvm/vmx.c
===================================================================
--- kvm.quilt.orig/arch/x86/kvm/vmx.c
+++ kvm.quilt/arch/x86/kvm/vmx.c
@@ -47,6 +47,7 @@ struct vcpu_vmx {
        struct kvm_vcpu       vcpu;
        int                   launched;
        u8                    fail;
+       u64                   first_tsc;
        u32                   idt_vectoring_info;
        struct kvm_msr_entry *guest_msrs;
        struct kvm_msr_entry *host_msrs;
@@ -480,6 +481,7 @@ static void vmx_load_host_state(struct v
        reload_host_efer(vmx);
 }
 
+static void guest_write_tsc(u64 host_tsc, u64 guest_tsc);
 /*
  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
  * vcpu mutex is already taken.
@@ -511,6 +513,18 @@ static void vmx_vcpu_load(struct kvm_vcp
                struct descriptor_table dt;
                unsigned long sysenter_esp;
 
+               if (unlikely(vcpu->cpu == -1)) {
+                       rdtscll(vcpu->arch.host_tsc);
+                       if (vcpu->vcpu_id == 0) {
+                               guest_write_tsc(vcpu->arch.host_tsc, 0);
+                               vmx->first_tsc = vcpu->arch.host_tsc;
+                       } else {
+                               struct vcpu_vmx *cpu0;
+                               cpu0 = to_vmx(vcpu->kvm->vcpus[0]);
+                               guest_write_tsc(cpu0->first_tsc, 0);
+                       }
+               }
+
                vcpu->cpu = cpu;
                /*
                 * Linux uses per-cpu TSS and GDT, so set these when switching
@@ -690,11 +704,8 @@ static u64 guest_read_tsc(void)
  * writes 'guest_tsc' into guest's timestamp counter "register"
  * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
  */
-static void guest_write_tsc(u64 guest_tsc)
+static void guest_write_tsc(u64 host_tsc, u64 guest_tsc)
 {
-       u64 host_tsc;
-
-       rdtscll(host_tsc);
        vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
 }
 
@@ -758,6 +769,7 @@ static int vmx_set_msr(struct kvm_vcpu *
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct kvm_msr_entry *msr;
+       u64 host_tsc;
        int ret = 0;
 
        switch (msr_index) {
@@ -786,7 +798,8 @@ static int vmx_set_msr(struct kvm_vcpu *
                vmcs_writel(GUEST_SYSENTER_ESP, data);
                break;
        case MSR_IA32_TIME_STAMP_COUNTER:
-               guest_write_tsc(data);
+               rdtscll(host_tsc);
+               guest_write_tsc(host_tsc, data);
                break;
        default:
                msr = find_msr_entry(vmx, msr_index);
@@ -1684,8 +1697,6 @@ static int vmx_vcpu_reset(struct kvm_vcp
        vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
        vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
 
-       guest_write_tsc(0);
-
        /* Special registers */
        vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
 


Index: kvm.quilt/arch/x86/kvm/vmx.c
===================================================================
--- kvm.quilt.orig/arch/x86/kvm/vmx.c
+++ kvm.quilt/arch/x86/kvm/vmx.c
@@ -238,26 +238,6 @@ static void vmcs_clear(struct vmcs *vmcs
                       vmcs, phys_addr);
 }
 
-static void __vcpu_clear(void *arg)
-{
-       struct vcpu_vmx *vmx = arg;
-       int cpu = raw_smp_processor_id();
-
-       if (vmx->vcpu.cpu == cpu)
-               vmcs_clear(vmx->vmcs);
-       if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
-               per_cpu(current_vmcs, cpu) = NULL;
-       rdtscll(vmx->vcpu.arch.host_tsc);
-}
-
-static void vcpu_clear(struct vcpu_vmx *vmx)
-{
-       if (vmx->vcpu.cpu == -1)
-               return;
-       smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1);
-       vmx->launched = 0;
-}
-
 static unsigned long vmcs_readl(unsigned long field)
 {
        unsigned long value;
@@ -348,6 +328,34 @@ static void update_exception_bitmap(stru
        vmcs_write32(EXCEPTION_BITMAP, eb);
 }
 
+static void __vcpu_clear(void *arg)
+{
+       struct vcpu_vmx *vmx = arg;
+       int cpu = raw_smp_processor_id();
+
+       if (vmx->vcpu.cpu == cpu)
+               vmcs_clear(vmx->vmcs);
+       if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
+               per_cpu(current_vmcs, cpu) = NULL;
+       rdtscll(vmx->vcpu.arch.host_tsc);
+}
+
+static void vcpu_clear(struct vcpu_vmx *vmx)
+{
+       u64 tsc_this, delta;
+
+       if (vmx->vcpu.cpu == -1)
+               return;
+       smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1);
+       /*
+        * Make sure the time stamp counter is monotonous.
+        */
+       rdtscll(tsc_this);
+       delta = vmx->vcpu.arch.host_tsc - tsc_this;
+       vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
+       vmx->launched = 0;
+}
+
 static void reload_tss(void)
 {
 #ifndef CONFIG_X86_64
@@ -490,7 +498,6 @@ static void vmx_vcpu_load(struct kvm_vcp
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u64 phys_addr = __pa(vmx->vmcs);
-       u64 tsc_this, delta;
 
        if (vcpu->cpu != cpu) {
                vcpu_clear(vmx);
@@ -536,13 +543,6 @@ static void vmx_vcpu_load(struct kvm_vcp
 
                rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
                vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
-
-               /*
-                * Make sure the time stamp counter is monotonous.
-                */
-               rdtscll(tsc_this);
-               delta = vcpu->arch.host_tsc - tsc_this;
-               vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
        }
 }
 
