The branch stable/13 has been updated by kib:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=c946f699856f6737a5256d7c9f746ac8035339ee

commit c946f699856f6737a5256d7c9f746ac8035339ee
Author:     Konstantin Belousov <[email protected]>
AuthorDate: 2021-07-10 19:38:42 +0000
Commit:     Konstantin Belousov <[email protected]>
CommitDate: 2021-08-23 23:21:12 +0000

    amd64: rework AP startup
    
    (cherry picked from commit d6717f877872e62d9df1e0ce2d8856620c993924)
---
 sys/amd64/amd64/machdep.c    |   4 +-
 sys/amd64/amd64/mp_machdep.c | 187 ++++++++++++++++---------------------------
 sys/amd64/amd64/mpboot.S     |  64 +++++++--------
 sys/amd64/include/smp.h      |   3 +-
 sys/x86/x86/mp_x86.c         |   5 --
 sys/x86/xen/pv.c             |   1 -
 6 files changed, 96 insertions(+), 168 deletions(-)

diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index 93030cbe7126..840570be534a 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -187,7 +187,6 @@ struct init_ops init_ops = {
        .early_delay =                  i8254_delay,
        .parse_memmap =                 native_parse_memmap,
 #ifdef SMP
-       .mp_bootaddress =               mp_bootaddress,
        .start_all_aps =                native_start_all_aps,
 #endif
 #ifdef DEV_PCI
@@ -1288,8 +1287,7 @@ getmemsize(caddr_t kmdp, u_int64_t first)
         * is configured to support APs and APs for the system start
         * in real mode mode (e.g. SMP bare metal).
         */
-       if (init_ops.mp_bootaddress)
-               init_ops.mp_bootaddress(physmap, &physmap_idx);
+       alloc_ap_trampoline(physmap, &physmap_idx);
 
        /* call pmap initialization to make new kernel address space */
        pmap_bootstrap(&first);
diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c
index d1064262891f..082a58ada48f 100644
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -106,6 +106,7 @@ char *dbg_stack;
 void *bootpcpu;
 
 extern u_int mptramp_la57;
+extern u_int mptramp_nx;
 
 /*
  * Local data and functions.
@@ -113,86 +114,6 @@ extern u_int mptramp_la57;
 
 static int     start_ap(int apic_id);
 
-static bool
-is_kernel_paddr(vm_paddr_t pa)
-{
-
-       return (pa >= trunc_2mpage(btext - KERNBASE) &&
-          pa < round_page(_end - KERNBASE));
-}
-
-static bool
-is_mpboot_good(vm_paddr_t start, vm_paddr_t end)
-{
-
-       return (start + AP_BOOTPT_SZ <= GiB(4) && atop(end) < Maxmem);
-}
-
-/*
- * Calculate usable address in base memory for AP trampoline code.
- */
-void
-mp_bootaddress(vm_paddr_t *physmap, unsigned int *physmap_idx)
-{
-       vm_paddr_t start, end;
-       unsigned int i;
-       bool allocated;
-
-       alloc_ap_trampoline(physmap, physmap_idx);
-
-       /*
-        * Find a memory region big enough below the 4GB boundary to
-        * store the initial page tables.  Region must be mapped by
-        * the direct map.
-        *
-        * Note that it needs to be aligned to a page boundary.
-        */
-       allocated = false;
-       for (i = *physmap_idx; i <= *physmap_idx; i -= 2) {
-               /*
-                * First, try to chomp at the start of the physmap region.
-                * Kernel binary might claim it already.
-                */
-               start = round_page(physmap[i]);
-               end = start + AP_BOOTPT_SZ;
-               if (start < end && end <= physmap[i + 1] &&
-                   is_mpboot_good(start, end) &&
-                   !is_kernel_paddr(start) && !is_kernel_paddr(end - 1)) {
-                       allocated = true;
-                       physmap[i] = end;
-                       break;
-               }
-
-               /*
-                * Second, try to chomp at the end.  Again, check
-                * against kernel.
-                */
-               end = trunc_page(physmap[i + 1]);
-               start = end - AP_BOOTPT_SZ;
-               if (start < end && start >= physmap[i] &&
-                   is_mpboot_good(start, end) &&
-                   !is_kernel_paddr(start) && !is_kernel_paddr(end - 1)) {
-                       allocated = true;
-                       physmap[i + 1] = start;
-                       break;
-               }
-       }
-       if (allocated) {
-               mptramp_pagetables = start;
-               if (physmap[i] == physmap[i + 1] && *physmap_idx != 0) {
-                       memmove(&physmap[i], &physmap[i + 2],
-                           sizeof(*physmap) * (*physmap_idx - i + 2));
-                       *physmap_idx -= 2;
-               }
-       } else {
-               mptramp_pagetables = trunc_page(boot_address) - AP_BOOTPT_SZ;
-               if (bootverbose)
-                       printf(
-"Cannot find enough space for the initial AP page tables, placing them at %#x",
-                           mptramp_pagetables);
-       }
-}
-
 /*
  * Initialize the IPI handlers and start up the AP's.
  */
@@ -244,6 +165,9 @@ cpu_mp_start(void)
        assign_cpu_ids();
 
        mptramp_la57 = la57;
+       mptramp_nx = pg_nx != 0;
+       MPASS(kernel_pmap->pm_cr3 < (1UL << 32));
+       mptramp_pagetables = kernel_pmap->pm_cr3;
 
        /* Start each Application Processor */
        init_ops.start_all_aps();
@@ -398,55 +322,67 @@ mp_realloc_pcpu(int cpuid, int domain)
 int
 native_start_all_aps(void)
 {
-       u_int64_t *pt5, *pt4, *pt3, *pt2;
+       vm_page_t m_pml4, m_pdp, m_pd[4];
+       pml5_entry_t old_pml45;
+       pml4_entry_t *v_pml4;
+       pdp_entry_t *v_pdp;
+       pd_entry_t *v_pd;
        u_int32_t mpbioswarmvec;
-       int apic_id, cpu, domain, i, xo;
+       int apic_id, cpu, domain, i;
        u_char mpbiosreason;
 
        mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
 
-       /* copy the AP 1st level boot code */
-       bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size);
-
-       /* Locate the page tables, they'll be below the trampoline */
+       /* Create a transient 1:1 mapping of low 4G */
        if (la57) {
-               pt5 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables);
-               xo = 1;
+               m_pml4 = pmap_page_alloc_below_4g(true);
+               v_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4));
        } else {
-               xo = 0;
+               v_pml4 = &kernel_pmap->pm_pmltop[0];
        }
-       pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables + xo * PAGE_SIZE);
-       pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t);
-       pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t);
-
-       /* Create the initial 1GB replicated page tables */
-       for (i = 0; i < 512; i++) {
-               if (la57) {
-                       pt5[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
-                           PAGE_SIZE);
-                       pt5[i] |= PG_V | PG_RW | PG_U;
-               }
-
-               /*
-                * Each slot of the level 4 pages points to the same
-                * level 3 page.
-                */
-               pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
-                   (xo + 1) * PAGE_SIZE);
-               pt4[i] |= PG_V | PG_RW | PG_U;
-
-               /*
-                * Each slot of the level 3 pages points to the same
-                * level 2 page.
-                */
-               pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
-                   ((xo + 2) * PAGE_SIZE));
-               pt3[i] |= PG_V | PG_RW | PG_U;
-
-               /* The level 2 page slots are mapped with 2MB pages for 1GB. */
-               pt2[i] = i * (2 * 1024 * 1024);
-               pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
+       m_pdp = pmap_page_alloc_below_4g(true);
+       v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp));
+       m_pd[0] = pmap_page_alloc_below_4g(false);
+       v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[0]));
+       for (i = 0; i < NPDEPG; i++)
+               v_pd[i] = (i << PDRSHIFT) | X86_PG_V | X86_PG_RW | X86_PG_A |
+                   X86_PG_M | PG_PS;
+       m_pd[1] = pmap_page_alloc_below_4g(false);
+       v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[1]));
+       for (i = 0; i < NPDEPG; i++)
+               v_pd[i] = (NBPDP + (i << PDRSHIFT)) | X86_PG_V | X86_PG_RW |
+                   X86_PG_A | X86_PG_M | PG_PS;
+       m_pd[2] = pmap_page_alloc_below_4g(false);
+       v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[2]));
+       for (i = 0; i < NPDEPG; i++)
+               v_pd[i] = (2UL * NBPDP + (i << PDRSHIFT)) | X86_PG_V |
+                   X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS;
+       m_pd[3] = pmap_page_alloc_below_4g(false);
+       v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[3]));
+       for (i = 0; i < NPDEPG; i++)
+               v_pd[i] = (3UL * NBPDP + (i << PDRSHIFT)) | X86_PG_V |
+                   X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS;
+       v_pdp[0] = VM_PAGE_TO_PHYS(m_pd[0]) | X86_PG_V |
+           X86_PG_RW | X86_PG_A | X86_PG_M;
+       v_pdp[1] = VM_PAGE_TO_PHYS(m_pd[1]) | X86_PG_V |
+           X86_PG_RW | X86_PG_A | X86_PG_M;
+       v_pdp[2] = VM_PAGE_TO_PHYS(m_pd[2]) | X86_PG_V |
+           X86_PG_RW | X86_PG_A | X86_PG_M;
+       v_pdp[3] = VM_PAGE_TO_PHYS(m_pd[3]) | X86_PG_V |
+           X86_PG_RW | X86_PG_A | X86_PG_M;
+       old_pml45 = kernel_pmap->pm_pmltop[0];
+       if (la57) {
+               kernel_pmap->pm_pmltop[0] = VM_PAGE_TO_PHYS(m_pml4) |
+                   X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
        }
+       v_pml4[0] = VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V |
+           X86_PG_RW | X86_PG_A | X86_PG_M;
+       pmap_invalidate_all(kernel_pmap);
+
+       /* copy the AP 1st level boot code */
+       bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size);
+       if (bootverbose)
+               printf("AP boot address %#x\n", boot_address);
 
        /* save the current value of the warm-start vector */
        if (!efi_boot)
@@ -517,6 +453,17 @@ native_start_all_aps(void)
        outb(CMOS_REG, BIOS_RESET);
        outb(CMOS_DATA, mpbiosreason);
 
+       /* Destroy transient 1:1 mapping */
+       kernel_pmap->pm_pmltop[0] = old_pml45;
+       invlpg(0);
+       if (la57)
+               vm_page_free(m_pml4);
+       vm_page_free(m_pd[3]);
+       vm_page_free(m_pd[2]);
+       vm_page_free(m_pd[1]);
+       vm_page_free(m_pd[0]);
+       vm_page_free(m_pdp);
+
        /* number of APs actually started */
        return (mp_naps);
 }
diff --git a/sys/amd64/amd64/mpboot.S b/sys/amd64/amd64/mpboot.S
index afdcffa573a4..1b5657d3bef8 100644
--- a/sys/amd64/amd64/mpboot.S
+++ b/sys/amd64/amd64/mpboot.S
@@ -95,12 +95,25 @@ protmode:
         * is later enabled.
         */
        mov     %cr4, %eax
-       orl     $CR4_PAE, %eax
+       orl     $(CR4_PAE | CR4_PGE), %eax
        cmpb    $0, mptramp_la57-mptramp_start(%ebx)
        je      1f
        orl     $CR4_LA57, %eax
 1:     mov     %eax, %cr4
 
+       /*
+        * If the BSP reported NXE support, enable EFER.NXE for all APs
+        * prior to loading %cr3. This avoids page faults if the AP
+        * encounters memory marked with the NX bit prior to detecting and
+        * enabling NXE support.
+        */
+       cmpb    $0,mptramp_nx-mptramp_start(%ebx)
+       je      2f
+       movl    $MSR_EFER, %ecx
+       rdmsr
+       orl     $EFER_NXE, %eax
+       wrmsr
+2:
        /*
         * Enable EFER.LME so that we get long mode when all the prereqs are
         * in place.  In this case, it turns on when CR0_PG is finally enabled.
@@ -112,12 +125,13 @@ protmode:
        wrmsr
 
        /*
-        * Point to the embedded page tables for startup.  Note that this
-        * only gets accessed after we're actually in 64 bit mode, however
-        * we can only set the bottom 32 bits of %cr3 in this state.  This
-        * means we are required to use a temporary page table that is below
-        * the 4GB limit.  %ebx is still our relocation base.  We could just
-        * subtract 3 * PAGE_SIZE, but that would be too easy.
+        * Load kernel page table pointer into %cr3.
+        * %ebx is still our relocation base.
+        *
+        * Note that this only gets accessed after we're actually in 64 bit
+        * mode, however we can only set the bottom 32 bits of %cr3 in this
+        * state.  This means we depend on the kernel page table being
+        * allocated from the low 4G.
         */
        leal    mptramp_pagetables-mptramp_start(%ebx),%eax
        movl    (%eax), %eax
@@ -155,10 +169,8 @@ jmp_64:
        /*
         * Yeehar!  We're running in 64 bit mode!  We can mostly ignore our
         * segment registers, and get on with it.
-        * Note that we are running at the correct virtual address, but with
-        * a 1:1 1GB mirrored mapping over entire address space.  We had better
-        * switch to a real %cr3 promptly so that we can get to the direct map
-        * space. Remember that jmp is relative and that we've been relocated,
+        * We are running in the correct virtual address space.
+        * Note that the jmp is relative and that we've been relocated,
         * so use an indirect jump.
         */
        .code64
@@ -220,6 +232,10 @@ mptramp_pagetables:
 mptramp_la57:
        .long   0
 
+       .globl  mptramp_nx
+mptramp_nx:
+       .long   0
+
        /*
         * The pseudo descriptor for lgdt to use.
         */
@@ -243,32 +259,6 @@ bootMP_size:
        .code64
        .p2align 4,0
 entry_64:
-       /*
-        * If the BSP reported NXE support, enable EFER.NXE for all APs
-        * prior to loading %cr3. This avoids page faults if the AP
-        * encounters memory marked with the NX bit prior to detecting and
-        * enabling NXE support.
-        */
-       movq    pg_nx, %rbx
-       testq   %rbx, %rbx
-       je      1f
-       movl    $MSR_EFER, %ecx
-       rdmsr
-       orl     $EFER_NXE, %eax
-       wrmsr
-
-1:
-       /*
-        * Load a real %cr3 that has all the direct map stuff and switches
-        * off the 1GB replicated mirror.  Load a stack pointer and jump
-        * into AP startup code in C.
-       */
-       cmpl    $0, la57
-       jne     2f
-       movq    KPML4phys, %rax
-       jmp     3f
-2:     movq    KPML5phys, %rax
-3:     movq    %rax, %cr3
        movq    bootSTK, %rsp
 
        /*
diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h
index 8fbd89da0e57..84ee73cef723 100644
--- a/sys/amd64/include/smp.h
+++ b/sys/amd64/include/smp.h
@@ -38,8 +38,7 @@ inthand_t
        IDTVEC(rendezvous_pti);
 
 void   invlop_handler(void);
-int    native_start_all_aps(void);
-void   mp_bootaddress(vm_paddr_t *, unsigned int *);
+int native_start_all_aps(void);
 
 #endif /* !LOCORE */
 #endif /* SMP */
diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c
index f1c1e45e79b8..441a766f87fb 100644
--- a/sys/x86/x86/mp_x86.c
+++ b/sys/x86/x86/mp_x86.c
@@ -1070,11 +1070,6 @@ init_secondary_tail(void)
        }
 
 #ifdef __amd64__
-       /*
-        * Enable global pages TLB extension
-        * This also implicitly flushes the TLB 
-        */
-       load_cr4(rcr4() | CR4_PGE);
        if (pmap_pcid_enabled)
                load_cr4(rcr4() | CR4_PCIDE);
        load_ds(_udatasel);
diff --git a/sys/x86/xen/pv.c b/sys/x86/xen/pv.c
index 2fd698772f9d..59c5b464aace 100644
--- a/sys/x86/xen/pv.c
+++ b/sys/x86/xen/pv.c
@@ -134,7 +134,6 @@ struct init_ops xen_pvh_init_ops = {
        .early_delay                    = xen_delay,
        .parse_memmap                   = xen_pvh_parse_memmap,
 #ifdef SMP
-       .mp_bootaddress                 = mp_bootaddress,
        .start_all_aps                  = native_start_all_aps,
 #endif
        .msi_init                       = msi_init,
_______________________________________________
[email protected] mailing list
https://lists.freebsd.org/mailman/listinfo/dev-commits-src-all
To unsubscribe, send any mail to "[email protected]"

Reply via email to