On 07/31/2012 06:50 AM, Alexander Graf wrote: > > On 25.07.2012, at 05:49, Bharat Bhushan wrote: > >> This patch adds the watchdog emulation in KVM. The watchdog >> emulation is enabled by KVM_ENABLE_CAP(KVM_CAP_PPC_BOOKE_WATCHDOG) ioctl. >> The kernel timer are used for watchdog emulation and emulates >> h/w watchdog state machine. On watchdog timer expiry, it exit to QEMU >> if TCR.WRC is non ZERO. QEMU can reset/shutdown etc depending upon how >> it is configured. >> >> Signed-off-by: Liu Yu <[email protected]> >> Signed-off-by: Scott Wood <[email protected]> >> [[email protected]: reworked patch] >> Signed-off-by: Bharat Bhushan <[email protected]> >> --- >> v6: >> - Added kvmppc_subarch_vcpu_unit/uninit() for subarch specific initialization >> - Using spin_lock_irqsave() >> - Using del_timer_sync(). >> >> v5: >> - Checking that TSR_ENW/TSR_WIS are still set on KVM_EXIT_WATCHDOG. >> - Moved watchdog_next_timeout() in lock. >> >> arch/powerpc/include/asm/kvm_host.h | 3 + >> arch/powerpc/include/asm/kvm_ppc.h | 4 + >> arch/powerpc/include/asm/reg_booke.h | 7 ++ >> arch/powerpc/kvm/book3s.c | 9 ++ >> arch/powerpc/kvm/booke.c | 156 >> ++++++++++++++++++++++++++++++++++ >> arch/powerpc/kvm/booke_emulate.c | 8 ++ >> arch/powerpc/kvm/powerpc.c | 14 +++- >> include/linux/kvm.h | 2 + >> include/linux/kvm_host.h | 1 + >> 9 files changed, 202 insertions(+), 2 deletions(-) >> >> diff --git a/arch/powerpc/include/asm/kvm_host.h >> b/arch/powerpc/include/asm/kvm_host.h >> index 50ea12f..43cac48 100644 >> --- a/arch/powerpc/include/asm/kvm_host.h >> +++ b/arch/powerpc/include/asm/kvm_host.h >> @@ -467,6 +467,8 @@ struct kvm_vcpu_arch { >> ulong fault_esr; >> ulong queued_dear; >> ulong queued_esr; >> + spinlock_t wdt_lock; >> + struct timer_list wdt_timer; >> u32 tlbcfg[4]; >> u32 mmucfg; >> u32 epr; >> @@ -482,6 +484,7 @@ struct kvm_vcpu_arch { >> u8 osi_needed; >> u8 osi_enabled; >> u8 papr_enabled; >> + u8 watchdog_enabled; >> u8 sane; >> u8 cpu_type; >> u8 hcall_needed; >> diff --git a/arch/powerpc/include/asm/kvm_ppc.h >> b/arch/powerpc/include/asm/kvm_ppc.h >> index 0124937..823d563 100644 >> --- a/arch/powerpc/include/asm/kvm_ppc.h >> +++ b/arch/powerpc/include/asm/kvm_ppc.h >> @@ -68,6 +68,8 @@ extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu); >> extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); >> extern void kvmppc_decrementer_func(unsigned long data); >> extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu); >> +extern int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu); >> +extern void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu); >> >> /* Core-specific hooks */ >> >> @@ -104,6 +106,8 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu >> *vcpu, >> struct kvm_interrupt *irq); >> extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, >> struct kvm_interrupt *irq); >> +extern void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu); >> +extern void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu); >> >> extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, >> unsigned int op, int *advance); >> diff --git a/arch/powerpc/include/asm/reg_booke.h >> b/arch/powerpc/include/asm/reg_booke.h >> index 2d916c4..e07e6af 100644 >> --- a/arch/powerpc/include/asm/reg_booke.h >> +++ b/arch/powerpc/include/asm/reg_booke.h >> @@ -539,6 +539,13 @@ >> #define TCR_FIE 0x00800000 /* FIT Interrupt Enable */ >> #define TCR_ARE 0x00400000 /* Auto Reload Enable */ >> >> +#ifdef CONFIG_E500 >> +#define TCR_GET_WP(tcr) ((((tcr) & 0xC0000000) >> 30) | \ >> + (((tcr) & 0x1E0000) >> 15)) >> +#else >> +#define TCR_GET_WP(tcr) (((tcr) & 0xC0000000) >> 30) >> +#endif >> + >> /* Bit definitions for the TSR. */ >> #define TSR_ENW 0x80000000 /* Enable Next Watchdog */ >> #define TSR_WIS 0x40000000 /* WDT Interrupt Status */ >> diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c >> index 3f2a836..e946665 100644 >> --- a/arch/powerpc/kvm/book3s.c >> +++ b/arch/powerpc/kvm/book3s.c >> @@ -411,6 +411,15 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) >> return 0; >> } >> >> +int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) >> +{ >> + return 0; >> +} >> + >> +void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) >> +{ >> +} >> + >> int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs >> *regs) >> { >> int i; >> diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c >> index d25a097..eadd86c 100644 >> --- a/arch/powerpc/kvm/booke.c >> +++ b/arch/powerpc/kvm/booke.c >> @@ -206,6 +206,16 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, >> clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); >> } >> >> +void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu) >> +{ >> + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG); >> +} >> + >> +void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu) >> +{ >> + clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions); >> +} >> + > > These should be static, since we only need them in booke specific code. > >> static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 >> srr1) >> { >> #ifdef CONFIG_KVM_BOOKE_HV >> @@ -325,6 +335,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu >> *vcpu, >> msr_mask = MSR_CE | MSR_ME | MSR_DE; >> int_class = INT_CLASS_NONCRIT; >> break; >> + case BOOKE_IRQPRIO_WATCHDOG: >> case BOOKE_IRQPRIO_CRITICAL: >> case BOOKE_IRQPRIO_DBELL_CRIT: >> allowed = vcpu->arch.shared->msr & MSR_CE; >> @@ -404,12 +415,114 @@ static int kvmppc_booke_irqprio_deliver(struct >> kvm_vcpu *vcpu, >> return allowed; >> } >> >> +/* >> + * Return the number of jiffies until the next timeout. If the timeout is >> + * longer than the NEXT_TIMER_MAX_DELTA, then return NEXT_TIMER_MAX_DELTA >> + * because the larger value can break the timer APIs. >> + */ >> +static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu) >> +{ >> + u64 tb, wdt_tb, wdt_ticks = 0; >> + u64 nr_jiffies = 0; >> + u32 period = TCR_GET_WP(vcpu->arch.tcr); >> + >> + wdt_tb = 1ULL << (63 - period); >> + tb = get_tb(); >> + /* >> + * The watchdog timeout will hapeen when TB bit corresponding >> + * to watchdog will toggle from 0 to 1. >> + */ >> + if (tb & wdt_tb) >> + wdt_ticks = wdt_tb; >> + >> + wdt_ticks += wdt_tb - (tb & (wdt_tb - 1)); >> + >> + /* Convert timebase ticks to jiffies */ >> + nr_jiffies = wdt_ticks; >> + >> + if (do_div(nr_jiffies, tb_ticks_per_jiffy)) >> + nr_jiffies++; >> + >> + return min_t(unsigned long long, nr_jiffies, NEXT_TIMER_MAX_DELTA); >> +} >> + >> +static void arm_next_watchdog(struct kvm_vcpu *vcpu) >> +{ >> + unsigned long nr_jiffies; >> + unsigned long flags; >> + >> + spin_lock_irqsave(&vcpu->arch.wdt_lock, flags); >> + nr_jiffies = watchdog_next_timeout(vcpu); >> + /* >> + * If the number of jiffies of watchdog timer >= NEXT_TIMER_MAX_DELTA >> + * then do not run the watchdog timer as this can break timer APIs. >> + */ >> + if (nr_jiffies < NEXT_TIMER_MAX_DELTA) >> + mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies); >> + else >> + del_timer(&vcpu->arch.wdt_timer); >> + spin_unlock_irqrestore(&vcpu->arch.wdt_lock, flags); >> +} >> + >> +void kvmppc_watchdog_func(unsigned long data) >> +{ >> + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; >> + u32 tsr, new_tsr; >> + int final; >> + >> + do { >> + new_tsr = tsr = vcpu->arch.tsr; >> + final = 0; >> + >> + /* Time out event */ >> + if (tsr & TSR_ENW) { >> + if (tsr & TSR_WIS) >> + final = 1; >> + else >> + new_tsr = tsr | TSR_WIS; >> + } else { >> + new_tsr = tsr | TSR_ENW; >> + } >> + } while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr); >> + >> + if (new_tsr & TSR_WIS) { >> + smp_wmb(); >> + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); >> + kvm_vcpu_kick(vcpu); >> + } >> + >> + /* >> + * If this is final watchdog expiry and some action is required >> + * then exit to userspace. >> + */ >> + if (final && (vcpu->arch.tcr & TCR_WRC_MASK) && >> + vcpu->arch.watchdog_enabled) { >> + smp_wmb(); >> + kvm_make_request(KVM_REQ_WATCHDOG, vcpu); >> + kvm_vcpu_kick(vcpu); >> + } >> + >> + /* >> + * Stop running the watchdog timer after final expiration to >> + * prevent the host from being flooded with timers if the >> + * guest sets a short period. >> + * Timers will resume when TSR/TCR is updated next time. >> + */ >> + if (!final) >> + arm_next_watchdog(vcpu); >> +} >> + >> static void update_timer_ints(struct kvm_vcpu *vcpu) >> { >> if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS)) >> kvmppc_core_queue_dec(vcpu); >> else >> kvmppc_core_dequeue_dec(vcpu); >> + >> + if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS)) >> + kvmppc_core_queue_watchdog(vcpu); >> + else >> + kvmppc_core_dequeue_watchdog(vcpu); >> } >> >> static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu) >> @@ -516,6 +629,13 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct >> kvm_vcpu *vcpu) >> goto out; >> } >> >> + if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu) && >> + (vcpu->arch.tsr & (TSR_ENW | TSR_WIS))) { >> + kvm_run->exit_reason = KVM_EXIT_WATCHDOG; >> + ret = 0; >> + goto out; >> + } > > I think we want to push this into a generic function that checks for > requests. Check out > > > https://github.com/agraf/linux-2.6/commit/e8fe975b59b24777c6314addc9cea2b959454738 > > for what I did there, namely kvmppc_check_requests. If your case we obviously > want a return value. I don't mind which patch goes in first, so if you add a > nice and generic version first, I can easily base my mmu notifier patches on > top of yours. > >> + >> kvm_guest_enter(); >> >> #ifdef CONFIG_PPC_FPU >> @@ -978,6 +1098,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct >> kvm_vcpu *vcpu, >> } >> } >> >> + if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu) && >> + (vcpu->arch.tsr & (TSR_ENW | TSR_WIS))) { >> + run->exit_reason = KVM_EXIT_WATCHDOG; >> + r = RESUME_HOST | (r & RESUME_FLAG_NV); >> + } >> + >> return r; >> } >> >> @@ -1011,6 +1137,21 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) >> return r; >> } >> >> +int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) >> +{ >> + /* setup watchdog timer once */ >> + spin_lock_init(&vcpu->arch.wdt_lock); >> + setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func, >> + (unsigned long)vcpu); >> + >> + return 0; >> +} >> + >> +void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) >> +{ >> + del_timer_sync(&vcpu->arch.wdt_timer); >> +} >> + >> int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs >> *regs) >> { >> int i; >> @@ -1106,7 +1247,13 @@ static int set_sregs_base(struct kvm_vcpu *vcpu, >> } >> >> if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) { >> + u32 old_tsr = vcpu->arch.tsr; >> + >> vcpu->arch.tsr = sregs->u.e.tsr; >> + >> + if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS)) >> + arm_next_watchdog(vcpu); > > Just curious. Is this potentially racy with the watchdog timer, since we're > not accessing tsr atomically.
Userspace updating TSR is racy (not just with the watchdog). That's why userspace needs to set a special flag for it to happen. It's meant for reset, migration, etc. rather than normal runtime operations. This could be a problem if we want QEMU to clear the watchdog when resuming from debug halt -- we don't want to miss a decrementer that happens around the same time. Maybe we should have separate virtual registers (via one-reg) for each timer source. The only new raciness I can see is if we race with a watchdog timer that determines itself to be the final expiration, and thus doesn't run arm_next_watchdog itself. In practice, I think you'd need three watchdog timers to go off between during the above code, so it's not very realistic, but I suppose you could use a cmpxchg to avoid it. -Scott -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to [email protected] More majordomo info at http://vger.kernel.org/majordomo-info.html
