On 28.06.2012, at 08:17, Bharat Bhushan wrote:
> This patch adds the watchdog emulation in KVM. The watchdog
> emulation is enabled by the KVM_ENABLE_CAP(KVM_CAP_PPC_WDT) ioctl.
> A kernel timer is used for the watchdog emulation and emulates the
> h/w watchdog state machine. On watchdog timer expiry, it exits to QEMU
> if TCR.WRC is non-zero. QEMU can then reset/shutdown etc. depending upon how
> it is configured.
>
> Signed-off-by: Liu Yu <[email protected]>
> Signed-off-by: Scott Wood <[email protected]>
> Signed-off-by: Bharat Bhushan <[email protected]>
> ---
> arch/powerpc/include/asm/kvm_host.h | 2 +
> arch/powerpc/include/asm/kvm_ppc.h | 3 +
> arch/powerpc/include/asm/reg_booke.h | 2 +
> arch/powerpc/kvm/44x.c | 1 +
> arch/powerpc/kvm/booke.c | 125 ++++++++++++++++++++++++++++++++++
> arch/powerpc/kvm/booke_emulate.c | 6 ++-
> arch/powerpc/kvm/powerpc.c | 25 ++++++-
> include/linux/kvm.h | 2 +
> 8 files changed, 162 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/kvm_host.h
> b/arch/powerpc/include/asm/kvm_host.h
> index 50ea12f..a9e5aed 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -467,6 +467,7 @@ struct kvm_vcpu_arch {
> ulong fault_esr;
> ulong queued_dear;
> ulong queued_esr;
> + struct timer_list wdt_timer;
> u32 tlbcfg[4];
> u32 mmucfg;
> u32 epr;
> @@ -482,6 +483,7 @@ struct kvm_vcpu_arch {
> u8 osi_needed;
> u8 osi_enabled;
> u8 papr_enabled;
> + u8 watchdog_enable;
> u8 sane;
> u8 cpu_type;
> u8 hcall_needed;
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h
> b/arch/powerpc/include/asm/kvm_ppc.h
> index 0124937..e5cf4b9 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -67,6 +67,7 @@ extern int kvmppc_emulate_mmio(struct kvm_run *run, struct
> kvm_vcpu *vcpu);
> extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
> extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
> extern void kvmppc_decrementer_func(unsigned long data);
> +extern void kvmppc_watchdog_func(unsigned long data);
> extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
>
> /* Core-specific hooks */
> @@ -104,6 +105,8 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu
> *vcpu,
> struct kvm_interrupt *irq);
> extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
> struct kvm_interrupt *irq);
> +extern void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu);
> +extern void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu);
>
> extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
> unsigned int op, int *advance);
> diff --git a/arch/powerpc/include/asm/reg_booke.h
> b/arch/powerpc/include/asm/reg_booke.h
> index 2d916c4..7efb3f5 100644
> --- a/arch/powerpc/include/asm/reg_booke.h
> +++ b/arch/powerpc/include/asm/reg_booke.h
> @@ -538,6 +538,8 @@
> #define FP_2_21 3 /* 2^21 clocks */
> #define TCR_FIE 0x00800000 /* FIT Interrupt Enable */
> #define TCR_ARE 0x00400000 /* Auto Reload Enable */
> +#define TCR_GET_FSL_WP(tcr) ((((tcr) & 0xC0000000) >> 30) | \
> + (((tcr) & 0x1E0000) >> 15))
>
> /* Bit definitions for the TSR. */
> #define TSR_ENW 0x80000000 /* Enable Next Watchdog */
> diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
> index 50e7dbc..a213aba 100644
> --- a/arch/powerpc/kvm/44x.c
> +++ b/arch/powerpc/kvm/44x.c
> @@ -28,6 +28,7 @@
> #include <asm/kvm_44x.h>
> #include <asm/kvm_ppc.h>
>
> +#include "booke.h"
> #include "44x_tlb.h"
> #include "booke.h"
>
> diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
> index d25a097..ad30f75 100644
> --- a/arch/powerpc/kvm/booke.c
> +++ b/arch/powerpc/kvm/booke.c
> @@ -206,6 +206,16 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
> clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
> }
>
> +void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu)
> +{
> + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG);
> +}
> +
> +void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu)
> +{
> + clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions);
> +}
> +
> static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
> {
> #ifdef CONFIG_KVM_BOOKE_HV
> @@ -325,6 +335,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu
> *vcpu,
> msr_mask = MSR_CE | MSR_ME | MSR_DE;
> int_class = INT_CLASS_NONCRIT;
> break;
> + case BOOKE_IRQPRIO_WATCHDOG:
> case BOOKE_IRQPRIO_CRITICAL:
> case BOOKE_IRQPRIO_DBELL_CRIT:
> allowed = vcpu->arch.shared->msr & MSR_CE;
> @@ -404,12 +415,98 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu
> *vcpu,
> return allowed;
> }
>
> +/*
> + * The timer system can almost deal with LONG_MAX timeouts, except that
> + * when you get very close to LONG_MAX, the slack added can cause overflow.
> + *
> + * LONG_MAX/2 is a conservative threshold, but it should be adequate for
> + * any realistic use.
> + */
> +#define MAX_TIMEOUT (LONG_MAX/2)
Should this really be in kvm code?
> +
> +/*
> + * Return the number of jiffies until the next timeout. If the timeout is
> + * longer than MAX_TIMEOUT, we return MAX_TIMEOUT instead.
> + */
> +static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu)
> +{
> + unsigned long long tb, mask, nr_jiffies = 0;
u64?
> + u32 period = TCR_GET_FSL_WP(vcpu->arch.tcr);
Doesn't sound like something booke generic to me.
> +
> + mask = 1ULL << (63 - period);
> + tb = get_tb();
> + if (tb & mask)
> + nr_jiffies += mask;
To be honest, you lost me here. nr_jiffies is in jiffies, right? While mask is
basically in timebase granularity. I suppose you're just reusing the variable
here for no good reason - the compiler will gladly optimize things for you if
you write this a bit more verbosely :).
Please make this function more readable, with comments if needed.
> +
> + nr_jiffies += mask - (tb & (mask - 1));
> +
> + if (do_div(nr_jiffies, tb_ticks_per_jiffy))
> + nr_jiffies++;
> +
> + return min_t(unsigned long long, nr_jiffies, MAX_TIMEOUT);
> +}
> +
> +static void arm_next_watchdog(struct kvm_vcpu *vcpu)
> +{
> + unsigned long nr_jiffies;
> +
> + nr_jiffies = watchdog_next_timeout(vcpu);
> + if (nr_jiffies < MAX_TIMEOUT)
> + mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
> + else
> + del_timer(&vcpu->arch.wdt_timer);
Can you del a timer that's not armed? Could that ever happen in this case?
Also, could the timer possibly be running somewhere, so do we need
del_timer_sync? Or don't we need to care?
> +}
> +
> +void kvmppc_watchdog_func(unsigned long data)
> +{
> + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
> + u32 tsr, new_tsr;
> + int final;
> +
> + do {
> + new_tsr = tsr = vcpu->arch.tsr;
> + final = 0;
> +
> + /* Time out event */
> + if (tsr & TSR_ENW) {
> + if (tsr & TSR_WIS) {
> + new_tsr = (tsr & ~TCR_WRC_MASK) |
> + (vcpu->arch.tcr & TCR_WRC_MASK);
> + vcpu->arch.tcr &= ~TCR_WRC_MASK;
> + final = 1;
> + } else {
> + new_tsr = tsr | TSR_WIS;
> + }
> + } else {
> + new_tsr = tsr | TSR_ENW;
> + }
> + } while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
> +
> + if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
> + smp_wmb();
> + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
> + kvm_vcpu_kick(vcpu);
> + }
> +
> + /*
> + * Avoid getting a storm of timers if the guest sets
> + * the period very short. We'll restart it if anything
> + * changes.
> + */
> + if (!final)
> + arm_next_watchdog(vcpu);
Mind to explain this part a bit further?
> +}
> static void update_timer_ints(struct kvm_vcpu *vcpu)
> {
> if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS))
> kvmppc_core_queue_dec(vcpu);
> else
> kvmppc_core_dequeue_dec(vcpu);
> +
> + if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS))
> + kvmppc_core_queue_watchdog(vcpu);
> + else
> + kvmppc_core_dequeue_watchdog(vcpu);
> }
>
> static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
> @@ -516,6 +613,12 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct
> kvm_vcpu *vcpu)
> goto out;
> }
>
> + if (vcpu->arch.tsr & TCR_WRC_MASK) {
> + kvm_run->exit_reason = KVM_EXIT_WDT;
> + ret = 0;
> + goto out;
> + }
> +
> kvm_guest_enter();
>
> #ifdef CONFIG_PPC_FPU
> @@ -977,6 +1080,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct
> kvm_vcpu *vcpu,
> kvmppc_account_exit(vcpu, SIGNAL_EXITS);
> }
> }
> + if (vcpu->arch.tsr & TCR_WRC_MASK) {
> + run->exit_reason = KVM_EXIT_WDT;
> + r = RESUME_HOST | (r & RESUME_FLAG_NV);
> + }
>
> return r;
> }
> @@ -1106,7 +1213,14 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
> }
>
> if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
> + u32 old_tsr = vcpu->arch.tsr;
> +
> vcpu->arch.tsr = sregs->u.e.tsr;
> +
> + if ((old_tsr ^ vcpu->arch.tsr) &
> + (TSR_ENW | TSR_WIS | TCR_WRC_MASK))
> + arm_next_watchdog(vcpu);
Why isn't this one guarded by vcpu->arch.watchdog_enable?
> +
> update_timer_ints(vcpu);
> }
>
> @@ -1267,6 +1381,8 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
> void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr)
> {
> vcpu->arch.tcr = new_tcr;
> + if (vcpu->arch.watchdog_enable)
> + arm_next_watchdog(vcpu);
> update_timer_ints(vcpu);
> }
>
> @@ -1281,6 +1397,15 @@ void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32
> tsr_bits)
> void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
> {
> clear_bits(tsr_bits, &vcpu->arch.tsr);
> +
> + /*
> + * We may have stopped the watchdog due to
> + * being stuck on final expiration.
> + */
> + if (vcpu->arch.watchdog_enable && (tsr_bits & (TSR_ENW |
> + TSR_WIS | TCR_WRC_MASK)))
> + arm_next_watchdog(vcpu);
> +
> update_timer_ints(vcpu);
> }
>
> diff --git a/arch/powerpc/kvm/booke_emulate.c
> b/arch/powerpc/kvm/booke_emulate.c
> index 12834bb..cc94d42 100644
> --- a/arch/powerpc/kvm/booke_emulate.c
> +++ b/arch/powerpc/kvm/booke_emulate.c
> @@ -145,7 +145,11 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu,
> int sprn, ulong spr_val)
> kvmppc_clr_tsr_bits(vcpu, spr_val);
> break;
> case SPRN_TCR:
> - kvmppc_set_tcr(vcpu, spr_val);
> + /* WRC can only be programmed when WRC=0 */
> + if (TCR_WRC_MASK & vcpu->arch.tcr)
Please reverse the order.
> + spr_val &= ~TCR_WRC_MASK;
> + kvmppc_set_tcr(vcpu,
> + spr_val | (TCR_WRC_MASK & vcpu->arch.tcr));
In fact, what you're trying to do here is keep TCR_WRC always on when it was
enabled once. So all you need is the OR here. No need for the mask above.
> break;
>
> case SPRN_DECAR:
> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> index 87f4dc8..35a1ff3 100644
> --- a/arch/powerpc/kvm/powerpc.c
> +++ b/arch/powerpc/kvm/powerpc.c
> @@ -38,9 +38,15 @@
>
> int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
> {
> - return !(v->arch.shared->msr & MSR_WE) ||
> - !!(v->arch.pending_exceptions) ||
> - v->requests;
> + bool ret = !(v->arch.shared->msr & MSR_WE) ||
> + !!(v->arch.pending_exceptions) ||
> + v->requests;
> +
> +#ifdef CONFIG_BOOKE
> + ret = ret || (v->arch.tsr & TCR_WRC_MASK);
Please make this a callback. In a header if you think it's performance
critical, but I don't want to see #ifdef CONFIG_BOOKE too often in powerpc.c.
> +#endif
> +
> + return ret;
> }
>
> int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
> @@ -237,6 +243,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> case KVM_CAP_PPC_GET_PVINFO:
> #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
> case KVM_CAP_SW_TLB:
> + case KVM_CAP_PPC_WDT:
> #endif
> r = 1;
> break;
> @@ -393,6 +400,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
> void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
> {
> kvmppc_mmu_destroy(vcpu);
> +#ifdef CONFIG_BOOKE
> + if (vcpu->arch.watchdog_enable)
> + del_timer(&vcpu->arch.wdt_timer);
> +#endif
> }
>
> void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> @@ -637,6 +648,14 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu
> *vcpu,
> r = 0;
> vcpu->arch.papr_enabled = true;
> break;
> +#ifdef CONFIG_BOOKE
> + case KVM_CAP_PPC_WDT:
> + r = 0;
> + vcpu->arch.watchdog_enable = true;
> + setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func,
> + (unsigned long)vcpu);
Shouldn't we guard against user space calling this twice?
Alex
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html