On Tue, Dec 31 2024 at 18:07, Frederic Weisbecker wrote:
> hrtimers are migrated away from the dying CPU to any online target at
> the CPUHP_AP_HRTIMERS_DYING stage in order not to delay bandwidth timers
> handling tasks involved in the CPU hotplug forward progress.
>
> However wake ups can still be performed by the outgoing CPU after
> CPUHP_AP_HRTIMERS_DYING. Those can result again in bandwidth timers
> being armed. Depending on several considerations (crystal ball
> power management based election, earliest timer already enqueued, timer
> migration enabled or not), the target may eventually be the current
> CPU even if offline. If that happens, the timer is eventually ignored.
>
> The most notable example is RCU which had to deal with each and every one of
> those wake-ups by deferring them to an online CPU, along with related
> workarounds:
>
> _ e787644caf76 (rcu: Defer RCU kthreads wakeup when CPU is dying)
> _ 9139f93209d1 (rcu/nocb: Fix RT throttling hrtimer armed from offline CPU)
> _ f7345ccc62a4 (rcu/nocb: Fix rcuog wake-up from offline softirq)
>
> The problem isn't confined to RCU though as the stop machine kthread
> (which runs CPUHP_AP_HRTIMERS_DYING) reports its completion at the end
> and performs a wake up that eventually arms the deadline server timer:

Where does it report the completion?

>            WARNING: CPU: 94 PID: 588 at kernel/time/hrtimer.c:1086 hrtimer_start_range_ns+0x289/0x2d0
>            CPU: 94 UID: 0 PID: 588 Comm: migration/94 Not tainted
>            Stopper: multi_cpu_stop+0x0/0x120 <- stop_machine_cpuslocked+0x66/0xc0
>            RIP: 0010:hrtimer_start_range_ns+0x289/0x2d0
>            Call Trace:
>             ? __warn+0xcf/0x1b0
>             ? hrtimer_start_range_ns+0x289/0x2d0
>             ? hrtimer_start_range_ns+0x186/0x2d0
>             start_dl_timer+0xfc/0x150
>             enqueue_dl_entity+0x367/0x640
>             dl_server_start+0x53/0xa0
>             enqueue_task_fair+0x363/0x460
>             enqueue_task+0x3c/0x200
>             ttwu_do_activate+0x94/0x240
>             try_to_wake_up+0x315/0x600
>             complete+0x4b/0x80

You trimmed the backtrace at the wrong end. You left all the useless
register gunk around, but removed the call chain leading up to the
completion. :)

I assume it's the complete in stomp_machine() ....

> Instead of providing yet another bandaid to work around the situation,
> fix it from hrtimers infrastructure instead: always migrate away a
> timer to an online target whenever it is enqueued from an offline CPU.

> +/*
> + * If the current CPU is offline and timers have been already
> + * migrated away, make sure not to enqueue locally and perform
> + * a remote retrigger as a last resort.
> + */
> +static void enqueue_hrtimer_offline(struct hrtimer *timer,
> +                                 struct hrtimer_clock_base *base,
> +                                 const enum hrtimer_mode mode)
> +{
> +#ifdef CONFIG_HOTPLUG_CPU
> +     struct hrtimer_cpu_base *new_cpu_base, *old_cpu_base, *this_cpu_base;
> +     struct hrtimer_clock_base *new_base;
> +     int cpu;
> +
> +     old_cpu_base = base->cpu_base;
> +     this_cpu_base = this_cpu_ptr(&hrtimer_bases);
> +
> +     if (old_cpu_base == this_cpu_base || !old_cpu_base->online) {
> +             WARN_ON_ONCE(hrtimer_callback_running(timer));
> +             cpu = cpumask_any_and(cpu_online_mask,
> +                                   housekeeping_cpumask(HK_TYPE_TIMER));
> +             new_cpu_base = &per_cpu(hrtimer_bases, cpu);
> +             new_base = &new_cpu_base->clock_base[base->index];
> +             WRITE_ONCE(timer->base, &migration_base);
> +             raw_spin_unlock(&old_cpu_base->lock);
> +             raw_spin_lock(&new_cpu_base->lock);
> +             WRITE_ONCE(timer->base, new_base);
> +     } else {
> +             new_base = base;
> +             new_cpu_base = new_base->cpu_base;
> +             cpu = new_cpu_base->cpu;
> +     }
> +
> +     if (enqueue_hrtimer(timer, new_base, mode))
> +             smp_call_function_single_async(cpu, &new_cpu_base->csd);

Duh. This reimplementation of switch_hrtimer_base() is really awful. We
can be smarter than that. Untested patch below.

Thanks,

        tglx
---
--- a/include/linux/hrtimer_defs.h
+++ b/include/linux/hrtimer_defs.h
@@ -125,6 +125,7 @@ struct hrtimer_cpu_base {
        ktime_t                         softirq_expires_next;
        struct hrtimer                  *softirq_next_timer;
        struct hrtimer_clock_base       clock_base[HRTIMER_MAX_CLOCK_BASES];
+       call_single_data_t              csd;
 } ____cacheline_aligned;
 
 
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -58,6 +58,8 @@
 #define HRTIMER_ACTIVE_SOFT    (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
 #define HRTIMER_ACTIVE_ALL     (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
 
+static void retrigger_next_event(void *arg);
+
 /*
  * The timer bases:
  *
@@ -111,7 +113,8 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base,
                        .clockid = CLOCK_TAI,
                        .get_time = &ktime_get_clocktai,
                },
-       }
+       },
+       .csd = CSD_INIT(retrigger_next_event, NULL)
 };
 
 static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
@@ -208,6 +211,13 @@ struct hrtimer_cpu_base *get_target_base
        if (static_branch_likely(&timers_migration_enabled) && !pinned)
                return &per_cpu(hrtimer_bases, get_nohz_timer_target());
 #endif
+#ifdef CONFIG_HOTPLUG_CPU
+       if (unlikely(!base->online)) {
+               int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER));
+
+               return &per_cpu(hrtimer_bases, cpu);
+       }
+#endif
        return base;
 }
 
@@ -254,7 +264,7 @@ switch_hrtimer_base(struct hrtimer *time
                raw_spin_unlock(&base->cpu_base->lock);
                raw_spin_lock(&new_base->cpu_base->lock);
 
-               if (new_cpu_base != this_cpu_base &&
+               if (new_cpu_base != this_cpu_base && this_cpu_base->online &&
                    hrtimer_check_target(timer, new_base)) {
                        raw_spin_unlock(&new_base->cpu_base->lock);
                        raw_spin_lock(&base->cpu_base->lock);
@@ -716,8 +726,6 @@ static inline int hrtimer_is_hres_enable
        return hrtimer_hres_enabled;
 }
 
-static void retrigger_next_event(void *arg);
-
 /*
  * Switch to high resolution mode
  */
@@ -1206,6 +1214,7 @@ static int __hrtimer_start_range_ns(stru
                                    u64 delta_ns, const enum hrtimer_mode mode,
                                    struct hrtimer_clock_base *base)
 {
+       struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
        struct hrtimer_clock_base *new_base;
        bool force_local, first;
 
@@ -1217,10 +1226,16 @@ static int __hrtimer_start_range_ns(stru
         * and enforce reprogramming after it is queued no matter whether
         * it is the new first expiring timer again or not.
         */
-       force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
+       force_local = base->cpu_base == this_cpu_base;
        force_local &= base->cpu_base->next_timer == timer;
 
        /*
+        * Don't force local queuing if this enqueue happens on an unplugged
+        * CPU after hrtimer_cpu_dying() has been invoked.
+        */
+       force_local &= this_cpu_base->online;
+
+       /*
         * Remove an active timer from the queue. In case it is not queued
         * on the current CPU, make sure that remove_hrtimer() updates the
         * remote data correctly.
@@ -1249,8 +1264,17 @@ static int __hrtimer_start_range_ns(stru
        }
 
        first = enqueue_hrtimer(timer, new_base, mode);
-       if (!force_local)
-               return first;
+       if (!force_local) {
+               if (likely(this_cpu_base->online))
+                       return first;
+
+               if (first) {
+                       struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base;
+
+                       smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd);
+               }
+               return 0;
+       }
 
        /*
         * Timer was forced to stay on the current CPU to avoid



Reply via email to