kexec_prepare_cpus_wait() calls get_cpu() internally to obtain the
current CPU id. kexec_prepare_cpus() calls kexec_prepare_cpus_wait()
twice -- once for KEXEC_STATE_IRQS_OFF and once for
KEXEC_STATE_REAL_MODE -- but only issues a single put_cpu() at the end,
leaving preempt_count elevated by one extra nesting level.

In practice the imbalance does not trigger a 'scheduling while atomic'
splat because the kexec path is a one-way trip: IRQs are already
disabled, no schedule() occurs after the leak, and
default_machine_kexec() overwrites preempt_count with HARDIRQ_OFFSET
before jumping into kexec_sequence(), which never returns. However, the
bookkeeping is still wrong.

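Move get_cpu() out of kexec_prepare_cpus_wait() and into
kexec_prepare_cpus(), so that it is called exactly once and pairs with
the single put_cpu() already there, and pass the CPU id to
kexec_prepare_cpus_wait() as a parameter. put_cpu() now directly
follows the second wait, ahead of the ppc_md.kexec_cpu_down() call.
This keeps preempt_count correctly balanced.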

Fixes: 1fc711f7ffb01 ("powerpc/kexec: Fix race in kexec shutdown")
Signed-off-by: Aboorva Devarajan <[email protected]>
---
 arch/powerpc/kexec/core_64.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c
index 825ab8a88f18e..9d7e5a1e6e5b8 100644
--- a/arch/powerpc/kexec/core_64.c
+++ b/arch/powerpc/kexec/core_64.c
@@ -164,12 +164,11 @@ static void kexec_smp_down(void *arg)
        /* NOTREACHED */
 }
 
-static void kexec_prepare_cpus_wait(int wait_state)
+static void kexec_prepare_cpus_wait(int wait_state, int my_cpu)
 {
-       int my_cpu, i, notified=-1;
+       int i, notified = -1;
 
        hw_breakpoint_disable();
-       my_cpu = get_cpu();
        /* Make sure each CPU has at least made it to the state we need.
         *
         * FIXME: There is a (slim) chance of a problem if not all of the CPUs
@@ -246,6 +245,8 @@ static void wake_offline_cpus(void)
 
 static void kexec_prepare_cpus(void)
 {
+       int my_cpu;
+
        wake_offline_cpus();
        smp_call_function(kexec_smp_down, NULL, /* wait */0);
        local_irq_disable();
@@ -254,7 +255,8 @@ static void kexec_prepare_cpus(void)
        mb(); /* make sure IRQs are disabled before we say they are */
        get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF;
 
-       kexec_prepare_cpus_wait(KEXEC_STATE_IRQS_OFF);
+       my_cpu = get_cpu();
+       kexec_prepare_cpus_wait(KEXEC_STATE_IRQS_OFF, my_cpu);
        /* we are sure every CPU has IRQs off at this point */
        kexec_all_irq_disabled = 1;
 
@@ -262,13 +264,12 @@ static void kexec_prepare_cpus(void)
         * Before removing MMU mappings make sure all CPUs have entered real
         * mode:
         */
-       kexec_prepare_cpus_wait(KEXEC_STATE_REAL_MODE);
+       kexec_prepare_cpus_wait(KEXEC_STATE_REAL_MODE, my_cpu);
+       put_cpu();
 
        /* after we tell the others to go down */
        if (ppc_md.kexec_cpu_down)
                ppc_md.kexec_cpu_down(0, 0);
-
-       put_cpu();
 }
 
 #else /* ! SMP */
-- 
2.54.0


Reply via email to