The branch stable/13 has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=8ca493ffb44691e70ae92300b8de1c1d30134ef4

commit 8ca493ffb44691e70ae92300b8de1c1d30134ef4
Author:     Konstantin Belousov <[email protected]>
AuthorDate: 2021-07-10 19:48:02 +0000
Commit:     Konstantin Belousov <[email protected]>
CommitDate: 2021-08-23 23:21:13 +0000

    amd64: do not assume that kernel is loaded at 2M physical
    
    (cherry picked from commit e18380e341410ce70d97560a22827591f4b2d373)
---
 sys/amd64/amd64/machdep.c   | 38 ++++++++++++++++++++++--
 sys/amd64/amd64/pmap.c      | 72 +++++++++++++++++++++++++++------------------
 sys/amd64/include/md_var.h  |  7 ++---
 sys/amd64/include/vmparam.h | 16 ++++++++--
 sys/conf/ldscript.amd64     |  5 ++--
 5 files changed, 96 insertions(+), 42 deletions(-)

diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index 63f933ad535c..2c8711fd3d2a 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -1599,7 +1599,10 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
        int gsel_tss, x;
        struct pcpu *pc;
        struct xstate_hdr *xhdr;
-       u_int64_t rsp0;
+       uint64_t cr3, rsp0;
+       pml4_entry_t *pml4e;
+       pdp_entry_t *pdpe;
+       pd_entry_t *pde;
        char *env;
        struct user_segment_descriptor *gdt;
        struct region_descriptor r_gdt;
@@ -1608,6 +1611,35 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
 
        TSRAW(&thread0, TS_ENTER, __func__, NULL);
 
+       /*
+        * Calculate kernphys by inspecting page table created by loader.
+        * The assumptions:
+        * - kernel is mapped at KERNBASE, backed by contiguous phys memory
+        *   aligned at 2M, below 4G (the latter is important for AP startup)
+        * - there is a 2M hole at KERNBASE
+        * - kernel is mapped with 2M superpages
+        * - all participating memory, i.e. kernel, modules, metadata,
+        *   page table is accessible by pre-created 1:1 mapping
+        *   (right now loader creates 1:1 mapping for lower 4G, and all
+        *   memory is from there)
+        * - there is a usable memory block right after the end of the
+        *   mapped kernel and all modules/metadata, pointed to by
+        *   physfree, for early allocations
+        */
+       cr3 = rcr3();
+       pml4e = (pml4_entry_t *)(cr3 & ~PAGE_MASK) + pmap_pml4e_index(
+           (vm_offset_t)hammer_time);
+       pdpe = (pdp_entry_t *)(*pml4e & ~PAGE_MASK) + pmap_pdpe_index(
+           (vm_offset_t)hammer_time);
+       pde = (pd_entry_t *)(*pdpe & ~PAGE_MASK) + pmap_pde_index(
+           (vm_offset_t)hammer_time);
+       kernphys = (vm_paddr_t)(*pde & ~PDRMASK) -
+           (vm_paddr_t)(((vm_offset_t)hammer_time - KERNBASE) & ~PDRMASK);
+
+       /* Fix-up for 2M hole */
+       physfree += kernphys;
+       kernphys += NBPDR;
+
        kmdp = init_ops.parse_preload_data(modulep);
 
        efi_boot = preload_search_info(kmdp, MODINFO_METADATA |
@@ -1653,7 +1685,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
        /* Init basic tunables, hz etc */
        init_param1();
 
-       thread0.td_kstack = physfree + KERNBASE;
+       thread0.td_kstack = physfree - kernphys + KERNSTART;
        thread0.td_kstack_pages = kstack_pages;
        kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
        bzero((void *)thread0.td_kstack, kstack0_sz);
@@ -1690,7 +1722,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
        wrmsr(MSR_GSBASE, (u_int64_t)pc);
        wrmsr(MSR_KGSBASE, 0);          /* User value while in the kernel */
 
-       dpcpu_init((void *)(physfree + KERNBASE), 0);
+       dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
        physfree += DPCPU_SIZE;
        amd64_bsp_pcpu_init1(pc);
        /* Non-late cninit() and printf() can be moved up to here. */
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index e5d46449c275..d35422924b1f 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -429,7 +429,8 @@ static u_int64_t    DMPDphys;       /* phys addr of direct mapped level 2 */
 static u_int64_t       DMPDPphys;      /* phys addr of direct mapped level 3 */
 static int             ndmpdpphys;     /* number of DMPDPphys pages */
 
-static vm_paddr_t      KERNend;        /* phys addr of end of bootstrap data */
+vm_paddr_t             kernphys;       /* phys addr of start of bootstrap data */
+vm_paddr_t             KERNend;        /* and the end */
 
 /*
  * pmap_mapdev support pre initialization (i.e. console)
@@ -1532,7 +1533,7 @@ nkpt_init(vm_paddr_t addr)
 #ifdef NKPT
        pt_pages = NKPT;
 #else
-       pt_pages = howmany(addr, NBPDR);
+       pt_pages = howmany(addr - kernphys, NBPDR) + 1; /* +1 for 2M hole @0 */
        pt_pages += NKPDPE(pt_pages);
 
        /*
@@ -1572,7 +1573,6 @@ nkpt_init(vm_paddr_t addr)
 static inline pt_entry_t
 bootaddr_rwx(vm_paddr_t pa)
 {
-
        /*
         * The kernel is loaded at a 2MB-aligned address, and memory below that
         * need not be executable.  The .bss section is padded to a 2MB
@@ -1580,8 +1580,8 @@ bootaddr_rwx(vm_paddr_t pa)
         * either.  Preloaded kernel modules have their mapping permissions
         * fixed up by the linker.
         */
-       if (pa < trunc_2mpage(btext - KERNBASE) ||
-           pa >= trunc_2mpage(_end - KERNBASE))
+       if (pa < trunc_2mpage(kernphys + btext - KERNSTART) ||
+           pa >= trunc_2mpage(kernphys + _end - KERNSTART))
                return (X86_PG_RW | pg_nx);
 
        /*
@@ -1590,7 +1590,7 @@ bootaddr_rwx(vm_paddr_t pa)
         * impact read-only data. However, in any case, any page with
         * read-write data needs to be read-write.
         */
-       if (pa >= trunc_2mpage(brwsection - KERNBASE))
+       if (pa >= trunc_2mpage(kernphys + brwsection - KERNSTART))
                return (X86_PG_RW | pg_nx);
 
        /*
@@ -1602,7 +1602,7 @@ bootaddr_rwx(vm_paddr_t pa)
         * Note that fixups to the .text section will still work until we
         * set CR0.WP.
         */
-       if (pa < round_2mpage(etext - KERNBASE))
+       if (pa < round_2mpage(kernphys + etext - KERNSTART))
                return (0);
        return (pg_nx);
 }
@@ -1610,11 +1610,12 @@ bootaddr_rwx(vm_paddr_t pa)
 static void
 create_pagetables(vm_paddr_t *firstaddr)
 {
-       int i, j, ndm1g, nkpdpe, nkdmpde;
        pd_entry_t *pd_p;
        pdp_entry_t *pdp_p;
        pml4_entry_t *p4_p;
        uint64_t DMPDkernphys;
+       vm_paddr_t pax;
+       int i, j, ndm1g, nkpdpe, nkdmpde;
 
        /* Allocate page table pages for the direct map */
        ndmpdp = howmany(ptoa(Maxmem), NBPDP);
@@ -1642,9 +1643,11 @@ create_pagetables(vm_paddr_t *firstaddr)
 
                /*
                 * Allocate 2M pages for the kernel. These will be used in
-                * place of the first one or more 1G pages from ndm1g.
+                * place of the one or more 1G pages from ndm1g that maps
+                * kernel memory into DMAP.
                 */
-               nkdmpde = howmany((vm_offset_t)(brwsection - KERNBASE), NBPDP);
+               nkdmpde = howmany((vm_offset_t)brwsection - KERNSTART +
+                   kernphys - rounddown2(kernphys, NBPDP), NBPDP);
                DMPDkernphys = allocpages(firstaddr, nkdmpde);
        }
        if (ndm1g < ndmpdp)
@@ -1681,14 +1684,18 @@ create_pagetables(vm_paddr_t *firstaddr)
                pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
 
        /*
-        * Map from physical address zero to the end of loader preallocated
-        * memory using 2MB pages.  This replaces some of the PD entries
-        * created above.
+        * Map from start of the kernel in physical memory (staging
+        * area) to the end of loader preallocated memory using 2MB
+        * pages.  This replaces some of the PD entries created above.
+        * For compatibility, identity map 2M at the start.
         */
-       for (i = 0; (i << PDRSHIFT) < KERNend; i++)
+       pd_p[0] = X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A |
+           X86_PG_RW | pg_nx;
+       for (i = 1, pax = kernphys; pax < KERNend; i++, pax += NBPDR) {
                /* Preset PG_M and PG_A because demotion expects it. */
-               pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
-                   X86_PG_M | X86_PG_A | bootaddr_rwx(i << PDRSHIFT);
+               pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M |
+                   X86_PG_A | bootaddr_rwx(pax);
+       }
 
        /*
         * Because we map the physical blocks in 2M pages, adjust firstaddr
@@ -1735,15 +1742,18 @@ create_pagetables(vm_paddr_t *firstaddr)
         * use 2M pages with read-only and no-execute permissions.  (If using 1G
         * pages, this will partially overwrite the PDPEs above.)
         */
-       if (ndm1g) {
+       if (ndm1g > 0) {
                pd_p = (pd_entry_t *)DMPDkernphys;
-               for (i = 0; i < (NPDEPG * nkdmpde); i++)
-                       pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
-                           X86_PG_M | X86_PG_A | pg_nx |
-                           bootaddr_rwx(i << PDRSHIFT);
-               for (i = 0; i < nkdmpde; i++)
-                       pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW |
-                           X86_PG_V | pg_nx;
+               for (i = 0, pax = rounddown2(kernphys, NBPDP);
+                   i < NPDEPG * nkdmpde; i++, pax += NBPDR) {
+                       pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M |
+                           X86_PG_A | pg_nx | bootaddr_rwx(pax);
+               }
+               j = rounddown2(kernphys, NBPDP) >> PDPSHIFT;
+               for (i = 0; i < nkdmpde; i++) {
+                       pdp_p[i + j] = (DMPDkernphys + ptoa(i)) |
+                           X86_PG_RW | X86_PG_V | pg_nx;
+               }
        }
 
        /* And recursively map PML4 to itself in order to get PTmap */
@@ -1811,7 +1821,8 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
        /*
         * Account for the virtual addresses mapped by create_pagetables().
         */
-       virtual_avail = (vm_offset_t)KERNBASE + round_2mpage(KERNend);
+       virtual_avail = (vm_offset_t)KERNSTART + round_2mpage(KERNend -
+           (vm_paddr_t)kernphys);
        virtual_end = VM_MAX_KERNEL_ADDRESS;
 
        /*
@@ -2348,7 +2359,8 @@ pmap_init(void)
                 * Collect the page table pages that were replaced by a 2MB
                 * page in create_pagetables().  They are zero filled.
                 */
-               if ((vm_paddr_t)i << PDRSHIFT < KERNend &&
+               if ((i == 0 ||
+                   kernphys + ((vm_paddr_t)(i - 1) << PDRSHIFT) < KERNend) &&
                    pmap_insert_pt_page(kernel_pmap, mpte, false))
                        panic("pmap_init: pmap_insert_pt_page failed");
        }
@@ -6567,7 +6579,9 @@ setpte:
            mpte < &vm_page_array[vm_page_array_size],
            ("pmap_promote_pde: page table page is out of range"));
        KASSERT(mpte->pindex == pmap_pde_pindex(va),
-           ("pmap_promote_pde: page table page's pindex is wrong"));
+           ("pmap_promote_pde: page table page's pindex is wrong "
+           "mpte %p pidx %#lx va %#lx va pde pidx %#lx",
+           mpte, mpte->pindex, va, pmap_pde_pindex(va)));
        if (pmap_insert_pt_page(pmap, mpte, true)) {
                atomic_add_long(&pmap_pde_p_failures, 1);
                CTR2(KTR_PMAP,
@@ -10625,8 +10639,8 @@ pmap_pti_init(void)
                va = __pcpu[i].pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu);
                pmap_pti_add_kva_locked(va - DBG_STACK_SIZE, va, false);
        }
-       pmap_pti_add_kva_locked((vm_offset_t)KERNBASE + NBPDR,
-           (vm_offset_t)etext, true);
+       pmap_pti_add_kva_locked((vm_offset_t)KERNSTART, (vm_offset_t)etext,
+           true);
        pti_finalized = true;
        VM_OBJECT_WUNLOCK(pti_obj);
 }
diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h
index b66e314d99b1..53139711bbff 100644
--- a/sys/amd64/include/md_var.h
+++ b/sys/amd64/include/md_var.h
@@ -49,11 +49,8 @@ extern vm_paddr_t intel_graphics_stolen_size;
 
 extern int la57;
 
-/*
- * The file "conf/ldscript.amd64" defines the symbol "kernphys".  Its
- * value is the physical address at which the kernel is loaded.
- */
-extern char kernphys[];
+extern vm_paddr_t kernphys;
+extern vm_paddr_t KERNend;
 
 extern bool efi_boot;
 
diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h
index c7ffb218dd4a..b6f79ef8ca84 100644
--- a/sys/amd64/include/vmparam.h
+++ b/sys/amd64/include/vmparam.h
@@ -149,8 +149,10 @@
 #endif
 
 /*
- * Kernel physical load address. Needs to be aligned at 2MB superpage
- * boundary.
+ * Kernel physical load address for non-UEFI boot and for legacy UEFI loader.
+ * Newer UEFI loader loads kernel anywhere below 4G, with memory allocated
+ * by boot services.
+ * Needs to be aligned at 2MB superpage boundary.
  */
 #ifndef KERNLOAD
 #define        KERNLOAD        0x200000
@@ -186,7 +188,17 @@
 #define        LARGEMAP_MIN_ADDRESS    KV4ADDR(LMSPML4I, 0, 0, 0)
 #define        LARGEMAP_MAX_ADDRESS    KV4ADDR(LMEPML4I + 1, 0, 0, 0)
 
+/*
+ * Formally kernel mapping starts at KERNBASE, but kernel linker
+ * script leaves first PDE reserved.  For legacy BIOS boot, kernel is
+ * loaded at KERNLOAD = 2M, and initial kernel page table maps
+ * physical memory from zero to KERNend starting at KERNBASE.
+ *
+ * KERNSTART is where the first actual kernel page is mapped, after
+ * the compatibility mapping.
+ */
 #define        KERNBASE                KV4ADDR(KPML4I, KPDPI, 0, 0)
+#define        KERNSTART               (KERNBASE + NBPDR)
 
 #define        UPT_MAX_ADDRESS         KV4ADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I)
 #define        UPT_MIN_ADDRESS         KV4ADDR(PML4PML4I, 0, 0, 0)
diff --git a/sys/conf/ldscript.amd64 b/sys/conf/ldscript.amd64
index c11ffb6ea49f..68085ff7435c 100644
--- a/sys/conf/ldscript.amd64
+++ b/sys/conf/ldscript.amd64
@@ -5,15 +5,14 @@ ENTRY(btext)
 SEARCH_DIR("/usr/lib");
 SECTIONS
 {
-  kernphys = kernload;
   /* Read-only sections, merged into text segment: */
-  . = kernbase + kernphys + SIZEOF_HEADERS;
+  . = kernbase + kernload + SIZEOF_HEADERS;
   /*
    * Use the AT keyword in order to set the right LMA that contains
    * the physical address where the section should be loaded. This is
    * needed for the Xen loader which honours the LMA.
    */
-  .interp         : AT (kernphys + SIZEOF_HEADERS) { *(.interp) }
+  .interp         : AT (kernload + SIZEOF_HEADERS) { *(.interp) }
   .hash           : { *(.hash) }
   .gnu.hash       : { *(.gnu.hash) }
   .dynsym         : { *(.dynsym) }
_______________________________________________
[email protected] mailing list
https://lists.freebsd.org/mailman/listinfo/dev-commits-src-all
To unsubscribe, send any mail to "[email protected]"

Reply via email to