On 1/24/24 4:55 AM, Kirill A. Shutemov wrote:
> MADT Multiprocessor Wakeup structure version 1 brings support of CPU
> offlining: BIOS provides a reset vector where the CPU has to jump to
> for offlining itself. The new TEST mailbox command can be used to test
> whether the CPU offlined itself which means the BIOS has control over
> the CPU and can online it again via the ACPI MADT wakeup method.
>
> Add CPU offlining support for the ACPI MADT wakeup method by implementing
> custom cpu_die(), play_dead() and stop_this_cpu() SMP operations.
>
> CPU offlining makes it possible to hand over secondary CPUs over kexec,
> not limiting the second kernel to a single CPU.
>
> The change conforms to the approved ACPI spec change proposal. See the
> Link.
>
> Signed-off-by: Kirill A. Shutemov <[email protected]>
> Link: https://lore.kernel.org/all/13356251.uLZWGnKmhe@kreacher
> ---

Reviewed-by: Kuppuswamy Sathyanarayanan 
<[email protected]>

>  arch/x86/include/asm/acpi.h          |   2 +
>  arch/x86/kernel/acpi/Makefile        |   2 +-
>  arch/x86/kernel/acpi/madt_playdead.S |  28 ++++
>  arch/x86/kernel/acpi/madt_wakeup.c   | 184 ++++++++++++++++++++++++++-
>  include/acpi/actbl2.h                |  15 ++-
>  5 files changed, 227 insertions(+), 4 deletions(-)
>  create mode 100644 arch/x86/kernel/acpi/madt_playdead.S
>
> diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
> index 2625b915ae7f..021cafa214c2 100644
> --- a/arch/x86/include/asm/acpi.h
> +++ b/arch/x86/include/asm/acpi.h
> @@ -81,6 +81,8 @@ union acpi_subtable_headers;
>  int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
>                             const unsigned long end);
>  
> +void asm_acpi_mp_play_dead(u64 reset_vector, u64 pgd_pa);
> +
>  /*
>   * Check if the CPU can handle C2 and deeper
>   */
> diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
> index 8c7329c88a75..37b1f28846de 100644
> --- a/arch/x86/kernel/acpi/Makefile
> +++ b/arch/x86/kernel/acpi/Makefile
> @@ -4,7 +4,7 @@ obj-$(CONFIG_ACPI)                    += boot.o
>  obj-$(CONFIG_ACPI_SLEEP)             += sleep.o wakeup_$(BITS).o
>  obj-$(CONFIG_ACPI_APEI)                      += apei.o
>  obj-$(CONFIG_ACPI_CPPC_LIB)          += cppc.o
> -obj-$(CONFIG_X86_ACPI_MADT_WAKEUP)   += madt_wakeup.o
> +obj-$(CONFIG_X86_ACPI_MADT_WAKEUP)   += madt_wakeup.o madt_playdead.o
>  
>  ifneq ($(CONFIG_ACPI_PROCESSOR),)
>  obj-y                                        += cstate.o
> diff --git a/arch/x86/kernel/acpi/madt_playdead.S 
> b/arch/x86/kernel/acpi/madt_playdead.S
> new file mode 100644
> index 000000000000..4e498d28cdc8
> --- /dev/null
> +++ b/arch/x86/kernel/acpi/madt_playdead.S
> @@ -0,0 +1,28 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#include <linux/linkage.h>
> +#include <asm/nospec-branch.h>
> +#include <asm/page_types.h>
> +#include <asm/processor-flags.h>
> +
> +     .text
> +     .align PAGE_SIZE
> +
> +/*
> + * asm_acpi_mp_play_dead() - Hand over control of the CPU to the BIOS
> + *
> + * rdi: Address of the ACPI MADT MPWK ResetVector
> + * rsi: PGD of the identity mapping
> + */
> +SYM_FUNC_START(asm_acpi_mp_play_dead)
> +     /* Turn off global entries. Following CR3 write will flush them. */
> +     movq    %cr4, %rdx
> +     andq    $~(X86_CR4_PGE), %rdx
> +     movq    %rdx, %cr4
> +
> +     /* Switch to identity mapping */
> +     movq    %rsi, %cr3
> +
> +     /* Jump to reset vector */
> +     ANNOTATE_RETPOLINE_SAFE
> +     jmp     *%rdi
> +SYM_FUNC_END(asm_acpi_mp_play_dead)
> diff --git a/arch/x86/kernel/acpi/madt_wakeup.c 
> b/arch/x86/kernel/acpi/madt_wakeup.c
> index 30820f9de5af..9e984e2191ba 100644
> --- a/arch/x86/kernel/acpi/madt_wakeup.c
> +++ b/arch/x86/kernel/acpi/madt_wakeup.c
> @@ -1,10 +1,19 @@
>  // SPDX-License-Identifier: GPL-2.0-or-later
>  #include <linux/acpi.h>
>  #include <linux/cpu.h>
> +#include <linux/delay.h>
>  #include <linux/io.h>
> +#include <linux/kexec.h>
> +#include <linux/memblock.h>
> +#include <linux/pgtable.h>
> +#include <linux/sched/hotplug.h>
>  #include <asm/apic.h>
>  #include <asm/barrier.h>
> +#include <asm/init.h>
> +#include <asm/intel_pt.h>
> +#include <asm/nmi.h>
>  #include <asm/processor.h>
> +#include <asm/reboot.h>
>  
>  /* Physical address of the Multiprocessor Wakeup Structure mailbox */
>  static u64 acpi_mp_wake_mailbox_paddr __ro_after_init;
> @@ -12,6 +21,154 @@ static u64 acpi_mp_wake_mailbox_paddr __ro_after_init;
>  /* Virtual address of the Multiprocessor Wakeup Structure mailbox */
>  static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox 
> __ro_after_init;
>  
> +static u64 acpi_mp_pgd __ro_after_init;
> +static u64 acpi_mp_reset_vector_paddr __ro_after_init;
> +
> +static void acpi_mp_stop_this_cpu(void)
> +{
> +     asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
> +}
> +
> +static void acpi_mp_play_dead(void)
> +{
> +     play_dead_common();
> +     asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
> +}
> +
> +static void acpi_mp_cpu_die(unsigned int cpu)
> +{
> +     u32 apicid = per_cpu(x86_cpu_to_apicid, cpu);
> +     unsigned long timeout;
> +
> +     /*
> +      * Use TEST mailbox command to prove that BIOS got control over
> +      * the CPU before declaring it dead.
> +      *
> +      * BIOS has to clear 'command' field of the mailbox.
> +      */
> +     acpi_mp_wake_mailbox->apic_id = apicid;
> +     smp_store_release(&acpi_mp_wake_mailbox->command,
> +                       ACPI_MP_WAKE_COMMAND_TEST);
> +
> +     /* Don't wait longer than a second. */
> +     timeout = USEC_PER_SEC;
> +     while (READ_ONCE(acpi_mp_wake_mailbox->command) && --timeout)
> +             udelay(1);
> +
> +     if (!timeout)
Nit: IMO, since this prints a failure message (not a timeout message),
you could check for a non-zero acpi_mp_wake_mailbox->command here
instead of checking !timeout. But it is up to you.
> +             pr_err("Failed to hand over CPU %d to BIOS\n", cpu);
> +}
> +
> +/* The argument is required to match type of 
> x86_mapping_info::alloc_pgt_page */
> +static void __init *alloc_pgt_page(void *dummy)
> +{
> +     return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
> +}
> +
> +static void __init free_pgt_page(void *pgt, void *dummy)
> +{
> +     return memblock_free(pgt, PAGE_SIZE);
> +}
> +
> +/*
> + * Make sure asm_acpi_mp_play_dead() is present in the identity mapping at
> + * the same place as in the kernel page tables. asm_acpi_mp_play_dead() 
> switches
> + * to the identity mapping and the function has to be present at the same spot 
> in
> + * the virtual address space before and after switching page tables.
> + */
> +static int __init init_transition_pgtable(pgd_t *pgd)
> +{
> +     pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
> +     unsigned long vaddr, paddr;
> +     p4d_t *p4d;
> +     pud_t *pud;
> +     pmd_t *pmd;
> +     pte_t *pte;
> +
> +     vaddr = (unsigned long)asm_acpi_mp_play_dead;
> +     pgd += pgd_index(vaddr);
> +     if (!pgd_present(*pgd)) {
> +             p4d = (p4d_t *)alloc_pgt_page(NULL);
> +             if (!p4d)
> +                     return -ENOMEM;
> +             set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
> +     }
> +     p4d = p4d_offset(pgd, vaddr);
> +     if (!p4d_present(*p4d)) {
> +             pud = (pud_t *)alloc_pgt_page(NULL);
> +             if (!pud)
> +                     return -ENOMEM;
> +             set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
> +     }
> +     pud = pud_offset(p4d, vaddr);
> +     if (!pud_present(*pud)) {
> +             pmd = (pmd_t *)alloc_pgt_page(NULL);
> +             if (!pmd)
> +                     return -ENOMEM;
> +             set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
> +     }
> +     pmd = pmd_offset(pud, vaddr);
> +     if (!pmd_present(*pmd)) {
> +             pte = (pte_t *)alloc_pgt_page(NULL);
> +             if (!pte)
> +                     return -ENOMEM;
> +             set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
> +     }
> +     pte = pte_offset_kernel(pmd, vaddr);
> +
> +     paddr = __pa(vaddr);
> +     set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
> +
> +     return 0;
> +}
> +
> +static int __init acpi_mp_setup_reset(u64 reset_vector)
> +{
> +     pgd_t *pgd;
> +     struct x86_mapping_info info = {
> +             .alloc_pgt_page = alloc_pgt_page,
> +             .free_pgt_page  = free_pgt_page,
> +             .page_flag      = __PAGE_KERNEL_LARGE_EXEC,
> +             .kernpg_flag    = _KERNPG_TABLE_NOENC,
> +     };
> +
> +     pgd = alloc_pgt_page(NULL);
> +     if (!pgd)
> +             return -ENOMEM;
> +
> +     for (int i = 0; i < nr_pfn_mapped; i++) {
> +             unsigned long mstart, mend;
> +
> +             mstart = pfn_mapped[i].start << PAGE_SHIFT;
> +             mend   = pfn_mapped[i].end << PAGE_SHIFT;
> +             if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
> +                     kernel_ident_mapping_free(&info, pgd);
> +                     return -ENOMEM;
> +             }
> +     }
> +
> +     if (kernel_ident_mapping_init(&info, pgd,
> +                                   PAGE_ALIGN_DOWN(reset_vector),
> +                                   PAGE_ALIGN(reset_vector + 1))) {
> +             kernel_ident_mapping_free(&info, pgd);
> +             return -ENOMEM;
> +     }
> +
> +     if (init_transition_pgtable(pgd)) {
> +             kernel_ident_mapping_free(&info, pgd);
> +             return -ENOMEM;
> +     }
> +
> +     smp_ops.play_dead = acpi_mp_play_dead;
> +     smp_ops.stop_this_cpu = acpi_mp_stop_this_cpu;
> +     smp_ops.cpu_die = acpi_mp_cpu_die;
> +
> +     acpi_mp_reset_vector_paddr = reset_vector;
> +     acpi_mp_pgd = __pa(pgd);
> +
> +     return 0;
> +}
> +
>  static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
>  {
>       if (!acpi_mp_wake_mailbox_paddr) {
> @@ -97,14 +254,37 @@ int __init acpi_parse_mp_wake(union 
> acpi_subtable_headers *header,
>       struct acpi_madt_multiproc_wakeup *mp_wake;
>  
>       mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
> -     if (BAD_MADT_ENTRY(mp_wake, end))
> +
> +     /*
> +      * Cannot use the standard BAD_MADT_ENTRY() to sanity check the @mp_wake
> +      * entry.  'sizeof (struct acpi_madt_multiproc_wakeup)' can be larger
> +      * than the actual size of the MP wakeup entry in ACPI table because the
> +      * 'reset_vector' is only available in the V1 MP wakeup structure.
> +      */
> +     if (!mp_wake)
> +             return -EINVAL;
> +     if (end - (unsigned long)mp_wake < ACPI_MADT_MP_WAKEUP_SIZE_V0)
> +             return -EINVAL;
> +     if (mp_wake->header.length < ACPI_MADT_MP_WAKEUP_SIZE_V0)
>               return -EINVAL;
>  
>       acpi_table_print_madt_entry(&header->common);
>  
>       acpi_mp_wake_mailbox_paddr = mp_wake->mailbox_address;
>  
> -     acpi_mp_disable_offlining(mp_wake);
> +     if (mp_wake->version >= ACPI_MADT_MP_WAKEUP_VERSION_V1 &&
> +         mp_wake->header.length >= ACPI_MADT_MP_WAKEUP_SIZE_V1) {
> +             if (acpi_mp_setup_reset(mp_wake->reset_vector)) {
> +                     pr_warn("Failed to setup MADT reset vector\n");
> +                     acpi_mp_disable_offlining(mp_wake);
> +             }
> +     } else {
> +             /*
> +              * CPU offlining requires version 1 of the ACPI MADT wakeup
> +              * structure.
> +              */
> +             acpi_mp_disable_offlining(mp_wake);
> +     }
>  
>       apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu);
>  
> diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h
> index e1a395af7591..2aedda70ef88 100644
> --- a/include/acpi/actbl2.h
> +++ b/include/acpi/actbl2.h
> @@ -1120,8 +1120,20 @@ struct acpi_madt_multiproc_wakeup {
>       u16 version;
>       u32 reserved;           /* reserved - must be zero */
>       u64 mailbox_address;
> +     u64 reset_vector;
>  };
>  
> +/* Values for Version field above */
> +
> +enum acpi_madt_multiproc_wakeup_version {
> +     ACPI_MADT_MP_WAKEUP_VERSION_NONE = 0,
> +     ACPI_MADT_MP_WAKEUP_VERSION_V1 = 1,
> +     ACPI_MADT_MP_WAKEUP_VERSION_RESERVED = 2, /* 2 and greater are reserved 
> */
> +};
> +
> +#define ACPI_MADT_MP_WAKEUP_SIZE_V0  16
> +#define ACPI_MADT_MP_WAKEUP_SIZE_V1  24
> +
>  #define ACPI_MULTIPROC_WAKEUP_MB_OS_SIZE        2032
>  #define ACPI_MULTIPROC_WAKEUP_MB_FIRMWARE_SIZE  2048
>  
> @@ -1134,7 +1146,8 @@ struct acpi_madt_multiproc_wakeup_mailbox {
>       u8 reserved_firmware[ACPI_MULTIPROC_WAKEUP_MB_FIRMWARE_SIZE];   /* 
> reserved for firmware use */
>  };
>  
> -#define ACPI_MP_WAKE_COMMAND_WAKEUP    1
> +#define ACPI_MP_WAKE_COMMAND_WAKEUP  1
> +#define ACPI_MP_WAKE_COMMAND_TEST    2
>  
>  /* 17: CPU Core Interrupt Controller (ACPI 6.5) */
>  

-- 
Sathyanarayanan Kuppuswamy
Linux Kernel Developer


_______________________________________________
kexec mailing list
[email protected]
http://lists.infradead.org/mailman/listinfo/kexec

Reply via email to