Rather than have KVM look up the host timer and fiddle with the
irq-work internal details, have the powerpc/time.c code provide a
function for KVM to re-arm the Linux timer code when exiting a
guest.

This is implementation has an improvement over existing code of
marking a decrementer interrupt as soft-pending if a timer has
expired, rather than setting DEC to a -ve value, which tended to
cause host timers to take two interrupts (first hdec to exit the
guest, then the immediate dec).

Signed-off-by: Nicholas Piggin <npig...@gmail.com>
---
 arch/powerpc/include/asm/time.h | 16 +++-------
 arch/powerpc/kernel/time.c      | 52 +++++++++++++++++++++++++++------
 arch/powerpc/kvm/book3s_hv.c    |  7 ++---
 3 files changed, 49 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 69b6be617772..924b2157882f 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -99,18 +99,6 @@ extern void div128_by_32(u64 dividend_high, u64 dividend_low,
 extern void secondary_cpu_time_init(void);
 extern void __init time_init(void);
 
-#ifdef CONFIG_PPC64
-static inline unsigned long test_irq_work_pending(void)
-{
-       unsigned long x;
-
-       asm volatile("lbz %0,%1(13)"
-               : "=r" (x)
-               : "i" (offsetof(struct paca_struct, irq_work_pending)));
-       return x;
-}
-#endif
-
 DECLARE_PER_CPU(u64, decrementers_next_tb);
 
 static inline u64 timer_get_next_tb(void)
@@ -118,6 +106,10 @@ static inline u64 timer_get_next_tb(void)
        return __this_cpu_read(decrementers_next_tb);
 }
 
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+void timer_rearm_host_dec(u64 now);
+#endif
+
 /* Convert timebase ticks to nanoseconds */
 unsigned long long tb_to_ns(unsigned long long tb_ticks);
 
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 6ce40d2ac201..2a6c118a43fb 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -498,6 +498,16 @@ EXPORT_SYMBOL(profile_pc);
  * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
  */
 #ifdef CONFIG_PPC64
+static inline unsigned long test_irq_work_pending(void)
+{
+       unsigned long x;
+
+       asm volatile("lbz %0,%1(13)"
+               : "=r" (x)
+               : "i" (offsetof(struct paca_struct, irq_work_pending)));
+       return x;
+}
+
 static inline void set_irq_work_pending_flag(void)
 {
        asm volatile("stb %0,%1(13)" : :
@@ -541,13 +551,44 @@ void arch_irq_work_raise(void)
        preempt_enable();
 }
 
+static void set_dec_or_work(u64 val)
+{
+       set_dec(val);
+       /* We may have raced with new irq work */
+       if (unlikely(test_irq_work_pending()))
+               set_dec(1);
+}
+
 #else  /* CONFIG_IRQ_WORK */
 
 #define test_irq_work_pending()        0
 #define clear_irq_work_pending()
 
+static void set_dec_or_work(u64 val)
+{
+       set_dec(val);
+}
 #endif /* CONFIG_IRQ_WORK */
 
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+void timer_rearm_host_dec(u64 now)
+{
+       u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
+
+       WARN_ON_ONCE(!arch_irqs_disabled());
+       WARN_ON_ONCE(mfmsr() & MSR_EE);
+
+       if (now >= *next_tb) {
+               local_paca->irq_happened |= PACA_IRQ_DEC;
+       } else {
+               now = *next_tb - now;
+               if (now <= decrementer_max)
+                       set_dec_or_work(now);
+       }
+}
+EXPORT_SYMBOL_GPL(timer_rearm_host_dec);
+#endif
+
 /*
  * timer_interrupt - gets called when the decrementer overflows,
  * with interrupts disabled.
@@ -608,10 +649,7 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt)
        } else {
                now = *next_tb - now;
                if (now <= decrementer_max)
-                       set_dec(now);
-               /* We may have raced with new irq work */
-               if (test_irq_work_pending())
-                       set_dec(1);
+                       set_dec_or_work(now);
                __this_cpu_inc(irq_stat.timer_irqs_others);
        }
 
@@ -853,11 +891,7 @@ static int decrementer_set_next_event(unsigned long evt,
                                      struct clock_event_device *dev)
 {
        __this_cpu_write(decrementers_next_tb, get_tb() + evt);
-       set_dec(evt);
-
-       /* We may have raced with new irq work */
-       if (test_irq_work_pending())
-               set_dec(1);
+       set_dec_or_work(evt);
 
        return 0;
 }
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index e4482bf546ed..e83c7aa7dbba 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4049,11 +4049,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
        vc->entry_exit_map = 0x101;
        vc->in_guest = 0;
 
-       next_timer = timer_get_next_tb();
-       set_dec(next_timer - tb);
-       /* We may have raced with new irq work */
-       if (test_irq_work_pending())
-               set_dec(1);
+       timer_rearm_host_dec(tb);
+
        mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso);
 
        kvmhv_load_host_pmu();
-- 
2.23.0

Reply via email to