Setting the realtime clock triggers an IPI to all CPUs to reprogram the clock event device.
However, only realtime and TAI clocks have their offsets updated (and therefore potentially require a reprogram). Instead of sending an IPI unconditionally, check each per-CPU hrtimer base whether it has active timers in the CLOCK_REALTIME, CLOCK_BOOTTIME and CLOCK_TAI bases. If that's not the case, update the realtime and TAI base offsets remotely and skip the IPI. This ensures that any subsequently armed timers on CLOCK_REALTIME and CLOCK_TAI are evaluated with the correct offsets. Signed-off-by: Marcelo Tosatti <mtosa...@redhat.com> --- v6: - Do not take softirq_raised into account (Peter Xu). - Include BOOTTIME as base that requires IPI (Thomas). - Unconditional reprogram on resume path, since there is nothing to gain in such path anyway. v5: - Add missing hrtimer_update_base (Peter Xu). v4: - Drop unused code (Thomas). v3: - Nicer changelog (Thomas). - Code style fixes (Thomas). - Compilation warning with CONFIG_HIGH_RES_TIMERS=n (Thomas). - Shrink preemption disabled section (Thomas). v2: - Only REALTIME and TAI bases are affected by offset-to-monotonic changes (Thomas). - Don't special case nohz_full CPUs (Thomas). 
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index bb5e7b0a4274..14a6e449b221 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -318,7 +318,7 @@ struct clock_event_device; extern void hrtimer_interrupt(struct clock_event_device *dev); -extern void clock_was_set_delayed(void); +extern void clock_was_set_delayed(bool force_reprogram); extern unsigned int hrtimer_resolution; @@ -326,7 +326,7 @@ extern unsigned int hrtimer_resolution; #define hrtimer_resolution (unsigned int)LOW_RES_NSEC -static inline void clock_was_set_delayed(void) { } +static inline void clock_was_set_delayed(bool force_reprogram) { } #endif @@ -351,7 +351,7 @@ hrtimer_expires_remaining_adjusted(const struct hrtimer *timer) timer->base->get_time()); } -extern void clock_was_set(void); +extern void clock_was_set(bool); #ifdef CONFIG_TIMERFD extern void timerfd_clock_was_set(void); #else diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 5c9d968187ae..2258782fd714 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -758,9 +758,17 @@ static void hrtimer_switch_to_hres(void) retrigger_next_event(NULL); } +static void clock_was_set_force_reprogram_work(struct work_struct *work) +{ + clock_was_set(true); +} + +static DECLARE_WORK(hrtimer_force_reprogram_work, clock_was_set_force_reprogram_work); + + static void clock_was_set_work(struct work_struct *work) { - clock_was_set(); + clock_was_set(false); } static DECLARE_WORK(hrtimer_work, clock_was_set_work); @@ -769,9 +777,12 @@ static DECLARE_WORK(hrtimer_work, clock_was_set_work); * Called from timekeeping and resume code to reprogram the hrtimer * interrupt device on all cpus. 
*/ -void clock_was_set_delayed(void) +void clock_was_set_delayed(bool force_reprogram) { - schedule_work(&hrtimer_work); + if (force_reprogram) + schedule_work(&hrtimer_force_reprogram_work); + else + schedule_work(&hrtimer_work); } #else @@ -871,6 +882,18 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) tick_program_event(expires, 1); } +#define CLOCK_SET_BASES ((1U << HRTIMER_BASE_REALTIME) | \ + (1U << HRTIMER_BASE_REALTIME_SOFT) | \ + (1U << HRTIMER_BASE_TAI) | \ + (1U << HRTIMER_BASE_TAI_SOFT) | \ + (1U << HRTIMER_BASE_BOOTTIME) | \ + (1U << HRTIMER_BASE_BOOTTIME_SOFT)) + +static bool need_reprogram_timer(struct hrtimer_cpu_base *cpu_base) +{ + return (cpu_base->active_bases & CLOCK_SET_BASES) != 0; +} + /* * Clock realtime was set * @@ -882,11 +905,42 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) * resolution timer interrupts. On UP we just disable interrupts and * call the high resolution interrupt code. */ -void clock_was_set(void) +void clock_was_set(bool force_reprogram) { #ifdef CONFIG_HIGH_RES_TIMERS - /* Retrigger the CPU local events everywhere */ - on_each_cpu(retrigger_next_event, NULL, 1); + cpumask_var_t mask; + int cpu; + + if (force_reprogram == true) { + on_each_cpu(retrigger_next_event, NULL, 1); + goto set_timerfd; + } + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { + on_each_cpu(retrigger_next_event, NULL, 1); + goto set_timerfd; + } + + /* Avoid interrupting CPUs if possible */ + cpus_read_lock(); + for_each_online_cpu(cpu) { + unsigned long flags; + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + if (need_reprogram_timer(cpu_base)) + cpumask_set_cpu(cpu, mask); + else + hrtimer_update_base(cpu_base); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + } + + preempt_disable(); + smp_call_function_many(mask, retrigger_next_event, NULL, 1); + preempt_enable(); + cpus_read_unlock(); + free_cpumask_var(mask); 
+set_timerfd: #endif timerfd_clock_was_set(); } @@ -903,7 +957,7 @@ void hrtimers_resume(void) /* Retrigger on the local CPU */ retrigger_next_event(NULL); /* And schedule a retrigger for all others */ - clock_was_set_delayed(); + clock_was_set_delayed(true); } /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6aee5768c86f..3fef237267bd 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1323,7 +1323,7 @@ int do_settimeofday64(const struct timespec64 *ts) raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ - clock_was_set(); + clock_was_set(false); if (!ret) audit_tk_injoffset(ts_delta); @@ -1371,7 +1371,7 @@ static int timekeeping_inject_offset(const struct timespec64 *ts) raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ - clock_was_set(); + clock_was_set(false); return ret; } @@ -1736,7 +1736,7 @@ void timekeeping_inject_sleeptime64(const struct timespec64 *delta) raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ - clock_was_set(); + clock_was_set(true); } #endif @@ -2187,7 +2187,7 @@ static void timekeeping_advance(enum timekeeping_adv_mode mode) raw_spin_unlock_irqrestore(&timekeeper_lock, flags); if (clock_set) /* Have to call _delayed version, since in irq context*/ - clock_was_set_delayed(); + clock_was_set_delayed(false); } /** @@ -2425,7 +2425,7 @@ int do_adjtimex(struct __kernel_timex *txc) timekeeping_advance(TK_ADV_FREQ); if (tai != orig_tai) - clock_was_set(); + clock_was_set(false); ntp_notify_cmos_timer();