Hi Patrick, On Wed, 11 Sept 2024 at 00:26, Patrick Rudolph <[email protected]> wrote: > > On Arm platforms that use ACPI they cannot rely on the "spin-table" > CPU bringup usually defined in the FDT. Thus implement the > 'ACPI Multi-processor Startup for ARM Platforms', also referred to as > 'ACPI parking protocol'. > > The ACPI parking protocol works similar to the spin-table mechanism, but > the specification also covers lots of shortcomings of the spin-table > implementations. > > Every CPU defined in the ACPI MADT table has it's own 4K page where the > spinloop code and the OS mailbox resides. When selected the U-Boot board > code must make sure that the secondary CPUs enter u-boot after relocation > as well, so that they can enter the spinloop code residing in the ACPI > parking protocol pages. > > The OS will then write to the mailbox and generate an IPI to release the > CPUs from the spinloop code. > > For now it's only implemented on ARMv8, but can easily be extended to > other platforms. > > TEST: Boots all CPUs on qemu-system-aarch64 -machine raspi4b > > Signed-off-by: Patrick Rudolph <[email protected]> > Cc: Simon Glass <[email protected]> > Cc: Tom Rini <[email protected]> > --- > Changelog v2: > - Use shorter function names > - Drop the use of atomics, they do not work on real hardware > - Rewrite code and verify on real hardware that CPUs are spinning > inside their parking protocol spin-loop code > > --- > arch/arm/cpu/armv8/Makefile | 1 + > arch/arm/cpu/armv8/acpi_park_v8.S | 107 ++++++++++++++++++++++++++ > arch/arm/cpu/armv8/start.S | 4 + > arch/arm/include/asm/acpi_table.h | 45 +++++++++++ > arch/arm/lib/acpi_table.c | 123 ++++++++++++++++++++++++++++++ > include/acpi/acpi_table.h | 10 +++ > include/bloblist.h | 1 + > lib/Kconfig | 15 ++++ > lib/acpi/acpi_table.c | 4 + > 9 files changed, 310 insertions(+) > create mode 100644 arch/arm/cpu/armv8/acpi_park_v8.S
Reviewed-by: Simon Glass <[email protected]> Just some nits, looks very tidy. > > diff --git a/arch/arm/cpu/armv8/Makefile b/arch/arm/cpu/armv8/Makefile > index bba4f570db..4f4368ff8c 100644 > --- a/arch/arm/cpu/armv8/Makefile > +++ b/arch/arm/cpu/armv8/Makefile > @@ -29,6 +29,7 @@ obj-$(CONFIG_ARM_SMCCC) += smccc-call.o > > ifndef CONFIG_SPL_BUILD > obj-$(CONFIG_ARMV8_SPIN_TABLE) += spin_table.o spin_table_v8.o > +obj-$(CONFIG_ACPI_PARKING_PROTOCOL) += acpi_park_v8.o > else > obj-$(CONFIG_ARCH_SUNXI) += fel_utils.o > endif > diff --git a/arch/arm/cpu/armv8/acpi_park_v8.S > b/arch/arm/cpu/armv8/acpi_park_v8.S > new file mode 100644 > index 0000000000..4dc7ff216c > --- /dev/null > +++ b/arch/arm/cpu/armv8/acpi_park_v8.S > @@ -0,0 +1,107 @@ > +/* SPDX-License-Identifier: GPL-2.0+ */ > +/* Could you add a comment here about what this file is for? > + * Copyright (C) 2024 9elements GmbH > + * Author: Patrick Rudolph <[email protected]> > + */ > + > +#include <linux/linkage.h> > +#include <asm/acpi_table.h> > + > +/* Filled by C code */ > +.global acpi_pp_tables > +acpi_pp_tables: > + .quad 0 > + > +.global acpi_pp_etables > +acpi_pp_etables: > + .quad 0 > + > +/* Read by C code */ > +.global acpi_pp_code_size > +acpi_pp_code_size: > + .word __secondary_pp_code_end - __secondary_pp_code_start > + > +.global acpi_pp_secondary_jump > +ENTRY(acpi_pp_secondary_jump) > +0: > + /* > + * Cannot use atomic operations since the MMU and D-cache > + * might be off. Use the MPIDR instead to find the spintable. > + */ > + > + /* Check if parking protocol table is ready */ > + ldr x1, =acpi_pp_tables > + ldr x0, [x1] > + cbnz x0, 0f > + wfe > + b 0b > + > +0: /* Get end of page tables in x3 */ > + ldr x1, =acpi_pp_etables > + ldr x3, [x1] > + > + /* Get own CPU ID in w2 */ > + mrs x2, mpidr_el1 > + lsr x9, x2, #32 > + bfi x2, x9, #24, #8 /* w2 is aff3:aff2:aff1:aff0 */ > + > +0: /* Loop over all parking protocol pages */ > + cmp x0, x3 > + b.ge hlt > + > + /* Fetch CPU_ID from current page */ > + ldr x1, [x0, #ACPI_PP_CPU_ID_OFFSET] > + lsr x9, x1, #32 > + bfi x1, x9, #24, #8 /* w1 is aff3:aff2:aff1:aff0 */ > + > + /* Compare CPU_IDs */ > + cmp w1, w2 > + b.eq 0f > + > + add x0, x0, #ACPI_PP_PAGE_SIZE > + b 0b > + > +hlt: wfi > + b hlt /* Should never happen. */ > + > +0: /* x0 points to the 4K aligned parking protocol page */ > + add x2, x0, #ACPI_PP_CPU_CODE_OFFSET > + > + /* Jump to spin code in own parking protocol page */ > + br x2 > +ENDPROC(acpi_pp_secondary_jump) > + > +.align 8 > +__secondary_pp_code_start: > +.global acpi_pp_code_start > +ENTRY(acpi_pp_code_start) > + /* x0 points to the 4K aligned parking protocol page */ "4K-aligned, parking-protocol page" > + > + /* Prepare defines for spinning code */ > + mov w3, #ACPI_PP_CPU_ID_INVALID > + mov x2, #ACPI_PP_JMP_ADR_INVALID > + > + /* Mark parking protocol page as ready */ > + str w3, [x0, #ACPI_PP_CPU_ID_OFFSET] > + dsb sy > + > +0: wfe > + ldr w1, [x0, #ACPI_PP_CPU_ID_OFFSET] > + > + /* Check CPU ID is not invalid */ > + cmp w1, w3 > + b.eq 0b > + > + /* Check jump address not invalid */ s/not invalid/valid/ ? > + ldr x1, [x0, #ACPI_PP_CPU_JMP_ADDR_OFFSET] > + cmp x1, x2 > + b.eq 0b > + > + /* Clear jump address before jump */ > + str x2, [x0, #ACPI_PP_CPU_JMP_ADDR_OFFSET] > + dsb sy > + > + br x1 > +ENDPROC(acpi_pp_code_start) > + /* Secondary Boot Code ends here */ > +__secondary_pp_code_end: > diff --git a/arch/arm/cpu/armv8/start.S b/arch/arm/cpu/armv8/start.S The changes to this file should really go in a separate commit since this is generic code. > index 7461280261..f56870768e 100644 > --- a/arch/arm/cpu/armv8/start.S > +++ b/arch/arm/cpu/armv8/start.S > @@ -178,6 +178,10 @@ pie_fixup_done: > branch_if_master x0, master_cpu > b spin_table_secondary_jump > /* never return */ > +#elif defined(CONFIG_ACPI_PARKING_PROTOCOL) && !defined(CONFIG_SPL_BUILD) A comment about what this is doing would be nice > + branch_if_master x0, master_cpu > + b acpi_pp_secondary_jump > + /* never return */ > #elif defined(CONFIG_ARMV8_MULTIENTRY) > branch_if_master x0, master_cpu > > diff --git a/arch/arm/include/asm/acpi_table.h > b/arch/arm/include/asm/acpi_table.h > index c65eabe837..c9ea7e04b4 100644 > --- a/arch/arm/include/asm/acpi_table.h > +++ b/arch/arm/include/asm/acpi_table.h > @@ -4,6 +4,7 @@ > #define __ASM_ACPI_TABLE_H__ > > #ifndef __ACPI__ > +#ifndef __ASSEMBLY__ > > #include <acpi/acpi_table.h> > > @@ -109,7 +110,51 @@ int acpi_pptt_add_cache(struct acpi_ctx *ctx, const u32 > flags, > const u32 sets, const u8 assoc, const u8 attributes, > const u16 line_size); > > +/* Multi-processor Startup for ARM Platforms */ > +struct acpi_parking_protocol_page { > + u64 cpu_id; What is the CPI ID, actually? How do you obtain it? Can you add a comment for this struct? > + u64 jumping_address; ulong entry ? > + u8 os_reserved[2032]; > + u8 cpu_spinning_code[2048]; I don't know of any other kind of spinning code that doesn't run on a cpu...so how about spin_code or spinning_code? > +} __packed; > + > +/* Architecture specific functions */ > +/** > + * acpi_pp_code_size - Spinloop code size * > + */ > +extern u16 acpi_pp_code_size; Please avoid extern in header files > + > +/** > + * acpi_pp_tables - Start of ACPI PP tables. > + */ > +extern u64 acpi_pp_tables; Is this used on 32-bit machines? Judging by the armv8 I would say not, so can we just use ulong here instead of u64? > + > +/** > + * acpi_pp_etables - End of ACPI PP tables. > + */ > +extern u64 acpi_pp_etables; > + > +/** > + * acpi_pp_code_start() - Spinloop code > + * > + * Architectural spinloop code to be installed in each parking protocol > + * page. The spinloop code must be less than 2048 bytes. > + * > + * The spinloop code will be entered after calling > + * acpi_parking_protocol_install(). > + * > + */ > +void acpi_pp_code_start(void); > + > #endif /* !__ASSEMBLY__ */ > #endif /* !__ACPI__ */ > > +#define ACPI_PP_CPU_ID_INVALID 0xffffffff > +#define ACPI_PP_JMP_ADR_INVALID 0 > +#define ACPI_PP_PAGE_SIZE 4096 > +#define ACPI_PP_CPU_ID_OFFSET 0 > +#define ACPI_PP_CPU_JMP_ADDR_OFFSET 8 Why do you need ADDR_OFFSET ? It is normally either an address or an offset in U-Boot. Would ACPI_PP_CPU_JMP_OFFSET be OK? > +#define ACPI_PP_CPU_CODE_OFFSET 2048 > +#define ACPI_PP_VERSION 1 > + > #endif /* __ASM_ACPI_TABLE_H__ */ > diff --git a/arch/arm/lib/acpi_table.c b/arch/arm/lib/acpi_table.c > index 75a0bc5b6e..d648d85049 100644 > --- a/arch/arm/lib/acpi_table.c > +++ b/arch/arm/lib/acpi_table.c > @@ -10,8 +10,16 @@ > #include <acpi/acpigen.h> > #include <acpi/acpi_device.h> > #include <acpi/acpi_table.h> > +#include <asm-generic/io.h> > #include <dm/acpi.h> > +#include <bloblist.h> > +#include <cpu_func.h> > +#include <efi_loader.h> > +#include <linux/log2.h> > +#include <linux/sizes.h> > +#include <malloc.h> > #include <string.h> > +#include <tables_csum.h> header order > > void acpi_write_madt_gicc(struct acpi_madt_gicc *gicc, uint cpu_num, > uint perf_gsiv, ulong phys_base, ulong gicv, > @@ -120,3 +128,118 @@ __weak void *acpi_fill_madt(struct acpi_madt *madt, > struct acpi_ctx *ctx) > acpi_fill_madt_subtbl(ctx); > return ctx->current; > } > + > +/** > + * acpi_write_pp_setup_one_page() - Fill out one page used by the PP > + * > + * Fill out the struct acpi_parking_protocol_page to contain the spin-loop > + * code and the mailbox area. After this function the page is ready for > + * the secondary core's to enter the spin-loop code. > + * > + * @page: Pointer to current parking protocol page > + * @gicc: Pointer to corresponding GICC sub-table > + */ > +static void acpi_write_pp_setup_one_page(struct acpi_parking_protocol_page > *page, acpi_parking_protocol_page is way too long How about acpi_park_page ? > + struct acpi_madt_gicc *gicc) > +{ > + void *reloc_addr; This is a pointer, not an address. An address is always ulong. So just 'reloc' > + > + /* Update GICC. Mark parking protocol as available. */ > + gicc->parking_proto = ACPI_PP_VERSION; > + gicc->parked_addr = virt_to_phys(page); > + > + /* Prepare parking protocol page */ > + memset(page, '\0', sizeof(struct acpi_parking_protocol_page)); > + > + /* Init mailbox. Set MPIDR so core's will find their page. */ > + page->cpu_id = gicc->mpidr; > + page->jumping_address = ACPI_PP_JMP_ADR_INVALID; > + > + /* Relocate spinning code */ > + reloc_addr = &page->cpu_spinning_code[0]; > + > + debug("Relocating spin table from %p to %p (size %x)\n", > + &acpi_pp_code_start, reloc_addr, acpi_pp_code_size); Can you use %lx to print addresses? It avoids 16 digits which makes them unreadable. Also we generally show addresses for U-Boot, so you can use map_to_sysmem() to turn a pointer into an address. > + memcpy(reloc_addr, &acpi_pp_code_start, acpi_pp_code_size); > + > + if (!CONFIG_IS_ENABLED(SYS_DCACHE_OFF)) > + flush_dcache_range((unsigned long)page, > + (unsigned long)(page + 1)); > +} > + > +void acpi_write_parking_protocol(struct acpi_madt *madt) > +{ > + struct acpi_parking_protocol_page *start, *page; > + struct acpi_madt_gicc *gicc; > + int ncpus = 0; > + > + /* According to the "Multi-processor Startup for ARM Platforms": /* * - According... > + * - Every CPU as specified by MADT GICC has it's own 4K page > + * - Every page is divided into two sections: OS and FW reserved > + * - Memory occupied by "Parking Protocol" must be marked 'Reserved' > + * - Spinloop code should reside in FW reserved 2048 bytes > + * - Spinloop code will check the mailbox in OS reserved area > + */ > + > + if (acpi_pp_code_size > sizeof(page->cpu_spinning_code)) { > + log_err("Spinning code too big to fit: %d\n", > + acpi_pp_code_size); > + return; > + } > + > + /* Count all MADT GICCs including BSP */ > + for (int i = sizeof(struct acpi_madt); i < madt->header.length; We normally declare the var at the top of functions. > + i += gicc->length) { > + gicc = (struct acpi_madt_gicc *)((void *)madt + i); > + if (gicc->type != ACPI_APIC_GICC) > + continue; > + ncpus++; > + } > + debug("Found %d GICCs in MADT\n", ncpus); %#x ? There could be alot. Up to you though > + > + /* Allocate pages linearly due to assembly code requirements */ > + if (IS_ENABLED(CONFIG_BLOBLIST_TABLES)) { > + start = bloblist_add(BLOBLISTT_ACPI_PP, ACPI_PP_PAGE_SIZE * > ncpus, > + ilog2(SZ_4K)); > + } else { > + start = memalign(ACPI_PP_PAGE_SIZE, ACPI_PP_PAGE_SIZE * > ncpus); Let's use the bloblist always... > + } > + if (!start) { > + log_err("Failed to allocate memory for ACPI parking protocol > pages\n"); ACPI-parking-protocol pages > + return; > + } > + log_debug("Allocated parking protocol at %p\n", start); > + page = start; > + > + if (IS_ENABLED(CONFIG_EFI_LOADER)) { > + /* Default mapping is 'BOOT CODE'. Mark as reserved instead. > */ > + int ret = efi_add_memory_map((u64)(uintptr_t)start, > + ncpus * ACPI_PP_PAGE_SIZE, > + EFI_RESERVED_MEMORY_TYPE); > + > + if (ret != EFI_SUCCESS) if (ret) > + log_err("Reserved memory mapping failed addr %p size > %x\n", > + start, ncpus * ACPI_PP_PAGE_SIZE); > + } > + > + /* Prepare the parking protocol pages */ > + for (int i = sizeof(struct acpi_madt); i < madt->header.length; decl again > + i += gicc->length) { > + gicc = (struct acpi_madt_gicc *)((void *)madt + i); > + if (gicc->type != ACPI_APIC_GICC) > + continue; > + > + acpi_write_pp_setup_one_page(page++, gicc); > + } > + > + acpi_pp_etables = virt_to_phys(start) + > + ACPI_PP_PAGE_SIZE * ncpus; > + acpi_pp_tables = virt_to_phys(start); > + > + /* Make sure other cores see written value in memory */ > + flush_dcache_all(); Is the cache ever disabled? > + > + /* Send an event to wake up the secondary CPU. */ > + asm("dsb ishst\n" > + "sev"); > +} > diff --git a/include/acpi/acpi_table.h b/include/acpi/acpi_table.h > index 3dd24b520d..abf69f16ba 100644 > --- a/include/acpi/acpi_table.h > +++ b/include/acpi/acpi_table.h > @@ -1237,6 +1237,16 @@ int acpi_iort_add_smmu_v3(struct acpi_ctx *ctx, > */ > void *acpi_fill_madt(struct acpi_madt *madt, struct acpi_ctx *ctx); > > +/** > + * acpi_write_parking_protocol() - Installs the ACPI parking protocol. > + * > + * Sets up the ACPI parking protocol and installs the spinning code for > + * secondary CPUs. > + * > + * @madt: The MADT to update > + */ > +void acpi_write_parking_protocol(struct acpi_madt *madt); acpi_write_park() ? > + > /** > * acpi_get_rsdp_addr() - get ACPI RSDP table address > * > diff --git a/include/bloblist.h b/include/bloblist.h > index b0706b5637..ff32d3fecf 100644 > --- a/include/bloblist.h > +++ b/include/bloblist.h > @@ -110,6 +110,7 @@ enum bloblist_tag_t { > BLOBLISTT_ACPI_TABLES = 4, > BLOBLISTT_TPM_EVLOG = 5, > BLOBLISTT_TPM_CRB_BASE = 6, > + BLOBLISTT_ACPI_PP = 7, > > /* Standard area to allocate blobs used across firmware components */ > BLOBLISTT_AREA_FIRMWARE = 0x10, > diff --git a/lib/Kconfig b/lib/Kconfig > index ea444354eb..a93b494288 100644 > --- a/lib/Kconfig > +++ b/lib/Kconfig > @@ -314,6 +314,21 @@ config GENERATE_ACPI_TABLE > by the operating system. It defines platform-independent interfaces > for configuration and power management monitoring. > > +config ACPI_PARKING_PROTOCOL > + bool "Support ACPI parking protocol method" > + depends on GENERATE_ACPI_TABLE > + depends on ARMV8_MULTIENTRY > + default y if !SEC_FIRMWARE_ARMV8_PSCI && !ARMV8_PSCI > + help > + Say Y here to support "ACPI parking protocol" enable method > + for booting Linux. > + > + To use this feature, you must do: > + - Bring secondary CPUs into U-Boot proper in a board specific board-specific > + manner. This must be done *after* relocation. Otherwise, the > + secondary CPUs will spin in unprotected memory area because the in an unprotected memory-area > + master CPU protects the relocated spin code. > + > config SPL_TINY_MEMSET > bool "Use a very small memset() in SPL" > depends on SPL > diff --git a/lib/acpi/acpi_table.c b/lib/acpi/acpi_table.c > index 8aab41212a..7b419144dc 100644 > --- a/lib/acpi/acpi_table.c > +++ b/lib/acpi/acpi_table.c > @@ -290,6 +290,10 @@ int acpi_write_madt(struct acpi_ctx *ctx, const struct > acpi_writer *entry) > > /* (Re)calculate length and checksum */ > header->length = (uintptr_t)current - (uintptr_t)madt; Can you cast to void * instead? We try to avoid casts between pointer and int and use map_sysmem() / map_to_sysmem() instead. > + > + if (IS_ENABLED(CONFIG_ACPI_PARKING_PROTOCOL)) > + acpi_write_parking_protocol(madt); > + > header->checksum = table_compute_checksum((void *)madt, > header->length); > acpi_add_table(ctx, madt); > ctx->current = (void *)madt + madt->header.length; > -- > 2.46.0 > Regards, Simon

