Hi Marcelo:
I am seeing erroneous accounting data in RHEL3 guests which I believe I
have traced to this patch. The easiest way to see this is to run 'mpstat
1': intr/s is in the 50's (e.g., for a nearly idle guest with negligible
disk/network). This is wrong. At a minimum it should be 100 -- 100 timer
interrupts per second.
Once this caught my eye, I took a look at /proc/stat. If you take
samples 1 second apart, the difference of the sums for the 'cpu' line
should be HZ * ncpus and for each individual cpu entry (e.g., cpu0,
cpu1, etc) the result should be HZ. In code:
function cpu_stat {
awk -v cpu="$1" '{
if ($1 == cpu)
{
sum=0
for (i=1; i<= NF; ++i)
sum += $i
print sum
}
}' /proc/stat
}
cpu=${1:-"cpu"}
d1=$(cpu_stat $cpu)
echo "have first sample. sleeping"
usleep 990000
d2=$(cpu_stat $cpu)
echo "delta: $(($d2 - $d1))"
I am seeing a result of 2*HZ. So for a 4 vcpu guest the delta for the
cpu line is > 800, and each cpu# line is >200.
You see the effect with the SMP kernel regardless of the number of vcpus
(1 or more) -- it's always twice what it should be and the interrupts
are always half what they should be. The accounting is fine with the
uniprocessor kernel. This suggests the problem is that lapic timer
interrupts are coming in twice as fast (or more) than they should.
Interestingly, I only see this for RHEL5.2 as the host OS; the
accounting is fine for Fedora 9 as the host OS. In both cases it's
kvm-72 with just this patch set.
Any ideas on where the problem could be?
david
Marcelo Tosatti wrote:
> The PIT injection logic is problematic under the following cases:
>
> 1) If there is a higher priority vector to be delivered by the time
> kvm_pit_timer_intr_post is invoked ps->inject_pending won't be set.
> This opens the possibility for missing many PIT event injections (say if
> guest executes hlt at this point).
>
> 2) ps->inject_pending is racy with more than two vcpus. Since there's no
> locking
> around read/dec of pt->pending, two vcpu's can inject two interrupts for a
> single
> pt->pending count.
>
> Fix 1 by using an irq ack notifier: only reinject when the previous irq
> has been acked. Fix 2 with appropriate locking around manipulation of
> pending count and irq_ack by the injection / ack paths.
>
> Also, count_load_time should be set at the time the count is reloaded,
> not when the interrupt is injected (BTW, LAPIC uses the same apparently
> broken scheme, could someone explain what was the reasoning behind
> that? kvm_apic_timer_intr_post).
>
> Signed-off-by: Marcelo Tosatti <[EMAIL PROTECTED]>
>
> Index: kvm/arch/x86/kvm/i8254.c
> ===================================================================
> --- kvm.orig/arch/x86/kvm/i8254.c
> +++ kvm/arch/x86/kvm/i8254.c
> @@ -207,6 +207,8 @@ static int __pit_timer_fn(struct kvm_kpi
>
> pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
> pt->scheduled = ktime_to_ns(pt->timer.expires);
> + if (pt->period)
> + ps->channels[0].count_load_time = pt->timer.expires;
>
> return (pt->period == 0 ? 0 : 1);
> }
> @@ -215,12 +217,22 @@ int pit_has_pending_timer(struct kvm_vcp
> {
> struct kvm_pit *pit = vcpu->kvm->arch.vpit;
>
> - if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending)
> + if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack)
> return atomic_read(&pit->pit_state.pit_timer.pending);
> -
> return 0;
> }
>
> +void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
> +{
> + struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
> + irq_ack_notifier);
> + spin_lock(&ps->inject_lock);
> + if (atomic_dec_return(&ps->pit_timer.pending) < 0)
> + WARN_ON(1);
> + ps->irq_ack = 1;
> + spin_unlock(&ps->inject_lock);
> +}
> +
> static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
> {
> struct kvm_kpit_state *ps;
> @@ -255,8 +267,9 @@ static void destroy_pit_timer(struct kvm
> hrtimer_cancel(&pt->timer);
> }
>
> -static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int
> is_period)
> +static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int
> is_period)
> {
> + struct kvm_kpit_timer *pt = &ps->pit_timer;
> s64 interval;
>
> interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
> @@ -268,6 +281,7 @@ static void create_pit_timer(struct kvm_
> pt->period = (is_period == 0) ? 0 : interval;
> pt->timer.function = pit_timer_fn;
> atomic_set(&pt->pending, 0);
> + ps->irq_ack = 1;
>
> hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
> HRTIMER_MODE_ABS);
> @@ -302,11 +316,11 @@ static void pit_load_count(struct kvm *k
> case 1:
> /* FIXME: enhance mode 4 precision */
> case 4:
> - create_pit_timer(&ps->pit_timer, val, 0);
> + create_pit_timer(ps, val, 0);
> break;
> case 2:
> case 3:
> - create_pit_timer(&ps->pit_timer, val, 1);
> + create_pit_timer(ps, val, 1);
> break;
> default:
> destroy_pit_timer(&ps->pit_timer);
> @@ -520,7 +534,7 @@ void kvm_pit_reset(struct kvm_pit *pit)
> mutex_unlock(&pit->pit_state.lock);
>
> atomic_set(&pit->pit_state.pit_timer.pending, 0);
> - pit->pit_state.inject_pending = 1;
> + pit->pit_state.irq_ack = 1;
> }
>
> struct kvm_pit *kvm_create_pit(struct kvm *kvm)
> @@ -534,6 +548,7 @@ struct kvm_pit *kvm_create_pit(struct kv
>
> mutex_init(&pit->pit_state.lock);
> mutex_lock(&pit->pit_state.lock);
> + spin_lock_init(&pit->pit_state.inject_lock);
>
> /* Initialize PIO device */
> pit->dev.read = pit_ioport_read;
> @@ -555,6 +570,9 @@ struct kvm_pit *kvm_create_pit(struct kv
> pit_state->pit = pit;
> hrtimer_init(&pit_state->pit_timer.timer,
> CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
> + pit_state->irq_ack_notifier.gsi = 0;
> + pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
> + kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
> mutex_unlock(&pit->pit_state.lock);
>
> kvm_pit_reset(pit);
> @@ -592,37 +610,19 @@ void kvm_inject_pit_timer_irqs(struct kv
> struct kvm_kpit_state *ps;
>
> if (vcpu && pit) {
> + int inject = 0;
> ps = &pit->pit_state;
>
> - /* Try to inject pending interrupts when:
> - * 1. Pending exists
> - * 2. Last interrupt was accepted or waited for too long time*/
> - if (atomic_read(&ps->pit_timer.pending) &&
> - (ps->inject_pending ||
> - (jiffies - ps->last_injected_time
> - >= KVM_MAX_PIT_INTR_INTERVAL))) {
> - ps->inject_pending = 0;
> - __inject_pit_timer_intr(kvm);
> - ps->last_injected_time = jiffies;
> - }
> - }
> -}
> -
> -void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
> -{
> - struct kvm_arch *arch = &vcpu->kvm->arch;
> - struct kvm_kpit_state *ps;
> -
> - if (vcpu && arch->vpit) {
> - ps = &arch->vpit->pit_state;
> - if (atomic_read(&ps->pit_timer.pending) &&
> - (((arch->vpic->pics[0].imr & 1) == 0 &&
> - arch->vpic->pics[0].irq_base == vec) ||
> - (arch->vioapic->redirtbl[0].fields.vector == vec &&
> - arch->vioapic->redirtbl[0].fields.mask != 1))) {
> - ps->inject_pending = 1;
> - atomic_dec(&ps->pit_timer.pending);
> - ps->channels[0].count_load_time = ktime_get();
> + /* Try to inject pending interrupts when
> + * last one has been acked.
> + */
> + spin_lock(&ps->inject_lock);
> + if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
> + ps->irq_ack = 0;
> + inject = 1;
> }
> + spin_unlock(&ps->inject_lock);
> + if (inject)
> + __inject_pit_timer_intr(kvm);
> }
> }
> Index: kvm/arch/x86/kvm/i8254.h
> ===================================================================
> --- kvm.orig/arch/x86/kvm/i8254.h
> +++ kvm/arch/x86/kvm/i8254.h
> @@ -8,7 +8,6 @@ struct kvm_kpit_timer {
> int irq;
> s64 period; /* unit: ns */
> s64 scheduled;
> - ktime_t last_update;
> atomic_t pending;
> };
>
> @@ -34,8 +33,9 @@ struct kvm_kpit_state {
> u32 speaker_data_on;
> struct mutex lock;
> struct kvm_pit *pit;
> - bool inject_pending; /* if inject pending interrupts */
> - unsigned long last_injected_time;
> + spinlock_t inject_lock;
> + unsigned long irq_ack;
> + struct kvm_irq_ack_notifier irq_ack_notifier;
> };
>
> struct kvm_pit {
> @@ -54,7 +54,6 @@ struct kvm_pit {
> #define KVM_PIT_CHANNEL_MASK 0x3
>
> void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
> -void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
> void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
> struct kvm_pit *kvm_create_pit(struct kvm *kvm);
> void kvm_free_pit(struct kvm *kvm);
> Index: kvm/arch/x86/kvm/irq.c
> ===================================================================
> --- kvm.orig/arch/x86/kvm/irq.c
> +++ kvm/arch/x86/kvm/irq.c
> @@ -90,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_tim
> void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
> {
> kvm_apic_timer_intr_post(vcpu, vec);
> - kvm_pit_timer_intr_post(vcpu, vec);
> /* TODO: PIT, RTC etc. */
> }
> EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
>
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at http://vger.kernel.org/majordomo-info.html