On 07/31/2012 06:50 AM, Alexander Graf wrote:
> 
> On 25.07.2012, at 05:49, Bharat Bhushan wrote:
> 
>> This patch adds the watchdog emulation in KVM. The watchdog
>> emulation is enabled by KVM_ENABLE_CAP(KVM_CAP_PPC_BOOKE_WATCHDOG) ioctl.
>> The kernel timer are used for watchdog emulation and emulates
>> h/w watchdog state machine. On watchdog timer expiry, it exit to QEMU
>> if TCR.WRC is non ZERO. QEMU can reset/shutdown etc depending upon how
>> it is configured.
>>
>> Signed-off-by: Liu Yu <[email protected]>
>> Signed-off-by: Scott Wood <[email protected]>
>> [[email protected]: reworked patch]
>> Signed-off-by: Bharat Bhushan <[email protected]>
>> ---
>> v6:
>> - Added kvmppc_subarch_vcpu_unit/uninit() for subarch specific initialization
>> - Using spin_lock_irqsave()
>> - Using del_timer_sync().
>>
>> v5:
>> - Checking that TSR_ENW/TSR_WIS are still set on KVM_EXIT_WATCHDOG.
>> - Moved watchdog_next_timeout() in lock.
>>
>> arch/powerpc/include/asm/kvm_host.h  |    3 +
>> arch/powerpc/include/asm/kvm_ppc.h   |    4 +
>> arch/powerpc/include/asm/reg_booke.h |    7 ++
>> arch/powerpc/kvm/book3s.c            |    9 ++
>> arch/powerpc/kvm/booke.c             |  156 
>> ++++++++++++++++++++++++++++++++++
>> arch/powerpc/kvm/booke_emulate.c     |    8 ++
>> arch/powerpc/kvm/powerpc.c           |   14 +++-
>> include/linux/kvm.h                  |    2 +
>> include/linux/kvm_host.h             |    1 +
>> 9 files changed, 202 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/kvm_host.h 
>> b/arch/powerpc/include/asm/kvm_host.h
>> index 50ea12f..43cac48 100644
>> --- a/arch/powerpc/include/asm/kvm_host.h
>> +++ b/arch/powerpc/include/asm/kvm_host.h
>> @@ -467,6 +467,8 @@ struct kvm_vcpu_arch {
>>      ulong fault_esr;
>>      ulong queued_dear;
>>      ulong queued_esr;
>> +    spinlock_t wdt_lock;
>> +    struct timer_list wdt_timer;
>>      u32 tlbcfg[4];
>>      u32 mmucfg;
>>      u32 epr;
>> @@ -482,6 +484,7 @@ struct kvm_vcpu_arch {
>>      u8 osi_needed;
>>      u8 osi_enabled;
>>      u8 papr_enabled;
>> +    u8 watchdog_enabled;
>>      u8 sane;
>>      u8 cpu_type;
>>      u8 hcall_needed;
>> diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
>> b/arch/powerpc/include/asm/kvm_ppc.h
>> index 0124937..823d563 100644
>> --- a/arch/powerpc/include/asm/kvm_ppc.h
>> +++ b/arch/powerpc/include/asm/kvm_ppc.h
>> @@ -68,6 +68,8 @@ extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
>> extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
>> extern void kvmppc_decrementer_func(unsigned long data);
>> extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
>> +extern int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu);
>> +extern void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu);
>>
>> /* Core-specific hooks */
>>
>> @@ -104,6 +106,8 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu 
>> *vcpu,
>>                                        struct kvm_interrupt *irq);
>> extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
>>                                          struct kvm_interrupt *irq);
>> +extern void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu);
>> +extern void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu);
>>
>> extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
>>                                   unsigned int op, int *advance);
>> diff --git a/arch/powerpc/include/asm/reg_booke.h 
>> b/arch/powerpc/include/asm/reg_booke.h
>> index 2d916c4..e07e6af 100644
>> --- a/arch/powerpc/include/asm/reg_booke.h
>> +++ b/arch/powerpc/include/asm/reg_booke.h
>> @@ -539,6 +539,13 @@
>> #define TCR_FIE              0x00800000      /* FIT Interrupt Enable */
>> #define TCR_ARE              0x00400000      /* Auto Reload Enable */
>>
>> +#ifdef CONFIG_E500
>> +#define TCR_GET_WP(tcr)  ((((tcr) & 0xC0000000) >> 30) | \
>> +                          (((tcr) & 0x1E0000) >> 15))
>> +#else
>> +#define TCR_GET_WP(tcr)  (((tcr) & 0xC0000000) >> 30)
>> +#endif
>> +
>> /* Bit definitions for the TSR. */
>> #define TSR_ENW              0x80000000      /* Enable Next Watchdog */
>> #define TSR_WIS              0x40000000      /* WDT Interrupt Status */
>> diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
>> index 3f2a836..e946665 100644
>> --- a/arch/powerpc/kvm/book3s.c
>> +++ b/arch/powerpc/kvm/book3s.c
>> @@ -411,6 +411,15 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
>>      return 0;
>> }
>>
>> +int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu)
>> +{
>> +    return 0;
>> +}
>> +
>> +void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu)
>> +{
>> +}
>> +
>> int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs 
>> *regs)
>> {
>>      int i;
>> diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
>> index d25a097..eadd86c 100644
>> --- a/arch/powerpc/kvm/booke.c
>> +++ b/arch/powerpc/kvm/booke.c
>> @@ -206,6 +206,16 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
>>      clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
>> }
>>
>> +void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu)
>> +{
>> +    kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG);
>> +}
>> +
>> +void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu)
>> +{
>> +    clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions);
>> +}
>> +
> 
> These should be static, since we only need them in booke specific code.
> 
>> static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 
>> srr1)
>> {
>> #ifdef CONFIG_KVM_BOOKE_HV
>> @@ -325,6 +335,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu 
>> *vcpu,
>>              msr_mask = MSR_CE | MSR_ME | MSR_DE;
>>              int_class = INT_CLASS_NONCRIT;
>>              break;
>> +    case BOOKE_IRQPRIO_WATCHDOG:
>>      case BOOKE_IRQPRIO_CRITICAL:
>>      case BOOKE_IRQPRIO_DBELL_CRIT:
>>              allowed = vcpu->arch.shared->msr & MSR_CE;
>> @@ -404,12 +415,114 @@ static int kvmppc_booke_irqprio_deliver(struct 
>> kvm_vcpu *vcpu,
>>      return allowed;
>> }
>>
>> +/*
>> + * Return the number of jiffies until the next timeout.  If the timeout is
>> + * longer than the NEXT_TIMER_MAX_DELTA, then return NEXT_TIMER_MAX_DELTA
>> + * because the larger value can break the timer APIs.
>> + */
>> +static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu)
>> +{
>> +    u64 tb, wdt_tb, wdt_ticks = 0;
>> +    u64 nr_jiffies = 0;
>> +    u32 period = TCR_GET_WP(vcpu->arch.tcr);
>> +
>> +    wdt_tb = 1ULL << (63 - period);
>> +    tb = get_tb();
>> +    /*
>> +     * The watchdog timeout will hapeen when TB bit corresponding
>> +     * to watchdog will toggle from 0 to 1.
>> +     */
>> +    if (tb & wdt_tb)
>> +            wdt_ticks = wdt_tb;
>> +
>> +    wdt_ticks += wdt_tb - (tb & (wdt_tb - 1));
>> +
>> +    /* Convert timebase ticks to jiffies */
>> +    nr_jiffies = wdt_ticks;
>> +
>> +    if (do_div(nr_jiffies, tb_ticks_per_jiffy))
>> +            nr_jiffies++;
>> +
>> +    return min_t(unsigned long long, nr_jiffies, NEXT_TIMER_MAX_DELTA);
>> +}
>> +
>> +static void arm_next_watchdog(struct kvm_vcpu *vcpu)
>> +{
>> +    unsigned long nr_jiffies;
>> +    unsigned long flags;
>> +
>> +    spin_lock_irqsave(&vcpu->arch.wdt_lock, flags);
>> +    nr_jiffies = watchdog_next_timeout(vcpu);
>> +    /*
>> +     * If the number of jiffies of watchdog timer >= NEXT_TIMER_MAX_DELTA
>> +     * then do not run the watchdog timer as this can break timer APIs.
>> +     */
>> +    if (nr_jiffies < NEXT_TIMER_MAX_DELTA)
>> +            mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
>> +    else
>> +            del_timer(&vcpu->arch.wdt_timer);
>> +    spin_unlock_irqrestore(&vcpu->arch.wdt_lock, flags);
>> +}
>> +
>> +void kvmppc_watchdog_func(unsigned long data)
>> +{
>> +    struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
>> +    u32 tsr, new_tsr;
>> +    int final;
>> +
>> +    do {
>> +            new_tsr = tsr = vcpu->arch.tsr;
>> +            final = 0;
>> +
>> +            /* Time out event */
>> +            if (tsr & TSR_ENW) {
>> +                    if (tsr & TSR_WIS)
>> +                            final = 1;
>> +                    else
>> +                            new_tsr = tsr | TSR_WIS;
>> +            } else {
>> +                    new_tsr = tsr | TSR_ENW;
>> +            }
>> +    } while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
>> +
>> +    if (new_tsr & TSR_WIS) {
>> +            smp_wmb();
>> +            kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
>> +            kvm_vcpu_kick(vcpu);
>> +    }
>> +
>> +    /*
>> +     * If this is final watchdog expiry and some action is required
>> +     * then exit to userspace.
>> +     */
>> +    if (final && (vcpu->arch.tcr & TCR_WRC_MASK) &&
>> +        vcpu->arch.watchdog_enabled) { 
>> +            smp_wmb();
>> +            kvm_make_request(KVM_REQ_WATCHDOG, vcpu);
>> +            kvm_vcpu_kick(vcpu);
>> +    }
>> +
>> +    /*
>> +     * Stop running the watchdog timer after final expiration to
>> +     * prevent the host from being flooded with timers if the
>> +     * guest sets a short period.
>> +     * Timers will resume when TSR/TCR is updated next time.
>> +     */
>> +    if (!final)
>> +            arm_next_watchdog(vcpu);
>> +}
>> +
>> static void update_timer_ints(struct kvm_vcpu *vcpu)
>> {
>>      if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS))
>>              kvmppc_core_queue_dec(vcpu);
>>      else
>>              kvmppc_core_dequeue_dec(vcpu);
>> +
>> +    if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS))
>> +            kvmppc_core_queue_watchdog(vcpu);
>> +    else
>> +            kvmppc_core_dequeue_watchdog(vcpu);
>> }
>>
>> static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
>> @@ -516,6 +629,13 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct 
>> kvm_vcpu *vcpu)
>>              goto out;
>>      }
>>
>> +    if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu) &&
>> +        (vcpu->arch.tsr & (TSR_ENW | TSR_WIS))) {
>> +            kvm_run->exit_reason = KVM_EXIT_WATCHDOG;
>> +            ret = 0;
>> +            goto out;
>> +    }
> 
> I think we want to push this into a generic function that checks for 
> requests. Check out
> 
>   
> https://github.com/agraf/linux-2.6/commit/e8fe975b59b24777c6314addc9cea2b959454738
> 
> for what I did there, namely kvmppc_check_requests. If your case we obviously 
> want a return value. I don't mind which patch goes in first, so if you add a 
> nice and generic version first, I can easily base my mmu notifier patches on 
> top of yours.
> 
>> +
>>      kvm_guest_enter();
>>
>> #ifdef CONFIG_PPC_FPU
>> @@ -978,6 +1098,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct 
>> kvm_vcpu *vcpu,
>>              }
>>      }
>>
>> +    if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu) &&
>> +        (vcpu->arch.tsr & (TSR_ENW | TSR_WIS))) {
>> +            run->exit_reason = KVM_EXIT_WATCHDOG;
>> +            r = RESUME_HOST | (r & RESUME_FLAG_NV);
>> +    }
>> +
>>      return r;
>> }
>>
>> @@ -1011,6 +1137,21 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
>>      return r;
>> }
>>
>> +int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu)
>> +{
>> +    /* setup watchdog timer once */
>> +    spin_lock_init(&vcpu->arch.wdt_lock);
>> +    setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func,
>> +                (unsigned long)vcpu);
>> +
>> +    return 0;
>> +}
>> +
>> +void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu)
>> +{
>> +    del_timer_sync(&vcpu->arch.wdt_timer);
>> +}
>> +
>> int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs 
>> *regs)
>> {
>>      int i;
>> @@ -1106,7 +1247,13 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
>>      }
>>
>>      if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
>> +            u32 old_tsr = vcpu->arch.tsr;
>> +
>>              vcpu->arch.tsr = sregs->u.e.tsr;
>> +
>> +            if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS))
>> +                    arm_next_watchdog(vcpu);
> 
> Just curious. Is this potentially racy with the watchdog timer, since we're 
> not accessing tsr atomically.

Userspace updating TSR is racy (not just with the watchdog).
That's why userspace needs to set a special flag for it to happen.  It's
meant for reset, migration, etc. rather than normal runtime operations.
 This could be a problem if we want QEMU to clear the watchdog when
resuming from debug halt -- we don't want to miss a decrementer that
happens around the same time.  Maybe we should have separate virtual
registers (via one-reg) for each timer source.

The only new raciness I can see is if we race with a watchdog timer that
determines itself to be the final expiration, and thus doesn't run
arm_next_watchdog itself.  In practice, I think you'd need three
watchdog timers to go off between during the above code, so it's not
very realistic, but I suppose you could use a cmpxchg to avoid it.

-Scott


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to