Author: markj
Date: Tue Jun  4 17:31:05 2019
New Revision: 348644
URL: https://svnweb.freebsd.org/changeset/base/348644

Log:
  MFC r344106:
  Implement transparent 2MB superpage promotion for RISC-V.

Modified:
  stable/12/sys/riscv/include/param.h
  stable/12/sys/riscv/include/pmap.h
  stable/12/sys/riscv/include/pte.h
  stable/12/sys/riscv/include/vmparam.h
  stable/12/sys/riscv/riscv/pmap.c
  stable/12/sys/vm/vm_fault.c
Directory Properties:
  stable/12/   (props changed)

Modified: stable/12/sys/riscv/include/param.h
==============================================================================
--- stable/12/sys/riscv/include/param.h Tue Jun  4 17:30:22 2019        (r348643)
+++ stable/12/sys/riscv/include/param.h Tue Jun  4 17:31:05 2019        (r348644)
@@ -82,7 +82,7 @@
 #define        PAGE_SIZE       (1 << PAGE_SHIFT)       /* Page size */
 #define        PAGE_MASK       (PAGE_SIZE - 1)
 
-#define        MAXPAGESIZES    1               /* maximum number of supported page sizes */
+#define        MAXPAGESIZES    3       /* maximum number of supported page sizes */
 
 #ifndef KSTACK_PAGES
 #define        KSTACK_PAGES    4       /* pages of kernel stack (with pcb) */

Modified: stable/12/sys/riscv/include/pmap.h
==============================================================================
--- stable/12/sys/riscv/include/pmap.h  Tue Jun  4 17:30:22 2019        (r348643)
+++ stable/12/sys/riscv/include/pmap.h  Tue Jun  4 17:31:05 2019        (r348644)
@@ -44,6 +44,8 @@
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 
+#include <vm/_vm_radix.h>
+
 #ifdef _KERNEL
 
 #define        vtophys(va)     pmap_kextract((vm_offset_t)(va))
@@ -80,6 +82,7 @@ struct pmap {
        pd_entry_t              *pm_l1;
        TAILQ_HEAD(,pv_chunk)   pm_pvchunk;     /* list of mappings in pmap */
        LIST_ENTRY(pmap)        pm_list;        /* List of all pmaps */
+       struct vm_radix         pm_root;
 };
 
 typedef struct pv_entry {
@@ -139,6 +142,7 @@ void        pmap_kenter_device(vm_offset_t, vm_size_t, vm_pad
 vm_paddr_t pmap_kextract(vm_offset_t va);
 void   pmap_kremove(vm_offset_t);
 void   pmap_kremove_device(vm_offset_t, vm_size_t);
+bool   pmap_ps_enabled(pmap_t);
 
 void   *pmap_mapdev(vm_offset_t, vm_size_t);
 void   *pmap_mapbios(vm_paddr_t, vm_size_t);

Modified: stable/12/sys/riscv/include/pte.h
==============================================================================
--- stable/12/sys/riscv/include/pte.h   Tue Jun  4 17:30:22 2019        (r348643)
+++ stable/12/sys/riscv/include/pte.h   Tue Jun  4 17:31:05 2019        (r348644)
@@ -62,7 +62,8 @@ typedef       uint64_t        pn_t;                   /* page number */
 #define        L3_SIZE         (1 << L3_SHIFT)
 #define        L3_OFFSET       (L3_SIZE - 1)
 
-#define        Ln_ENTRIES      (1 << 9)
+#define        Ln_ENTRIES_SHIFT 9
+#define        Ln_ENTRIES      (1 << Ln_ENTRIES_SHIFT)
 #define        Ln_ADDR_MASK    (Ln_ENTRIES - 1)
 
 /* Bits 9:8 are reserved for software */
@@ -79,6 +80,8 @@ typedef       uint64_t        pn_t;                   /* page number */
 #define        PTE_RWX         (PTE_R | PTE_W | PTE_X)
 #define        PTE_RX          (PTE_R | PTE_X)
 #define        PTE_KERN        (PTE_V | PTE_R | PTE_W | PTE_A | PTE_D)
+#define        PTE_PROMOTE     (PTE_V | PTE_RWX | PTE_D | PTE_A | PTE_G | PTE_U | \
+                        PTE_SW_MANAGED | PTE_SW_WIRED)
 
 #define        PTE_PPN0_S      10
 #define        PTE_PPN1_S      19

Modified: stable/12/sys/riscv/include/vmparam.h
==============================================================================
--- stable/12/sys/riscv/include/vmparam.h       Tue Jun  4 17:30:22 2019        (r348643)
+++ stable/12/sys/riscv/include/vmparam.h       Tue Jun  4 17:31:05 2019        (r348644)
@@ -99,10 +99,10 @@
 #define        VM_NFREEORDER           12
 
 /*
- * Disable superpage reservations.
+ * Enable superpage reservations: 1 level.
  */
 #ifndef        VM_NRESERVLEVEL
-#define        VM_NRESERVLEVEL         0
+#define        VM_NRESERVLEVEL         1
 #endif
 
 /*

Modified: stable/12/sys/riscv/riscv/pmap.c
==============================================================================
--- stable/12/sys/riscv/riscv/pmap.c    Tue Jun  4 17:30:22 2019        (r348643)
+++ stable/12/sys/riscv/riscv/pmap.c    Tue Jun  4 17:31:05 2019        (r348644)
@@ -118,6 +118,7 @@ __FBSDID("$FreeBSD$");
  */
 
 #include <sys/param.h>
+#include <sys/bitstring.h>
 #include <sys/bus.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
@@ -145,6 +146,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_extern.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
+#include <vm/vm_phys.h>
 #include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 #include <vm/uma.h>
@@ -154,9 +156,8 @@ __FBSDID("$FreeBSD$");
 #include <machine/pcb.h>
 #include <machine/sbi.h>
 
-#define        NPDEPG          (PAGE_SIZE/(sizeof (pd_entry_t)))
-#define        NUPDE                   (NPDEPG * NPDEPG)
-#define        NUSERPGTBLS             (NUPDE + NPDEPG)
+#define        NUL1E           (Ln_ENTRIES * Ln_ENTRIES)
+#define        NUL2E           (Ln_ENTRIES * NUL1E)
 
 #if !defined(DIAGNOSTIC)
 #ifdef __GNUC_GNU_INLINE__
@@ -175,11 +176,12 @@ __FBSDID("$FreeBSD$");
 #endif
 
 #define        pmap_l2_pindex(v)       ((v) >> L2_SHIFT)
+#define        pa_to_pvh(pa)           (&pv_table[pa_index(pa)])
 
 #define        NPV_LIST_LOCKS  MAXCPU
 
 #define        PHYS_TO_PV_LIST_LOCK(pa)        \
-                       (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
+                       (&pv_list_locks[pmap_l2_pindex(pa) % NPV_LIST_LOCKS])
 
 #define        CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)  do {    \
        struct rwlock **_lockp = (lockp);               \
@@ -230,13 +232,52 @@ CTASSERT((DMAP_MAX_ADDRESS  & ~L1_OFFSET) == DMAP_MAX_
 static struct rwlock_padalign pvh_global_lock;
 static struct mtx_padalign allpmaps_lock;
 
+static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0,
+    "VM/pmap parameters");
+
+static int superpages_enabled = 1;
+SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
+    CTLFLAG_RDTUN, &superpages_enabled, 0,
+    "Enable support for transparent superpages");
+
+static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD, 0,
+    "2MB page mapping counters");
+
+static u_long pmap_l2_demotions;
+SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
+    &pmap_l2_demotions, 0,
+    "2MB page demotions");
+
+static u_long pmap_l2_mappings;
+SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
+    &pmap_l2_mappings, 0,
+    "2MB page mappings");
+
+static u_long pmap_l2_p_failures;
+SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
+    &pmap_l2_p_failures, 0,
+    "2MB page promotion failures");
+
+static u_long pmap_l2_promotions;
+SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
+    &pmap_l2_promotions, 0,
+    "2MB page promotions");
+
 /*
  * Data for the pv entry allocation mechanism
  */
 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
 static struct mtx pv_chunks_mutex;
 static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
+static struct md_page *pv_table;
+static struct md_page pv_dummy;
 
+/*
+ * Internal flags for pmap_enter()'s helper functions.
+ */
+#define        PMAP_ENTER_NORECLAIM    0x1000000       /* Don't reclaim PV entries. */
+#define        PMAP_ENTER_NOREPLACE    0x2000000       /* Don't replace mappings. */
+
 static void    free_pv_chunk(struct pv_chunk *pc);
 static void    free_pv_entry(pmap_t pmap, pv_entry_t pv);
 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
@@ -244,6 +285,11 @@ static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, 
 static void    pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
                    vm_offset_t va);
+static bool    pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va);
+static bool    pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2,
+                   vm_offset_t va, struct rwlock **lockp);
+static int     pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
+                   u_int flags, vm_page_t m, struct rwlock **lockp);
 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
@@ -254,9 +300,9 @@ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap,
 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
                struct rwlock **lockp);
 
-static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
+static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
     struct spglist *free);
-static int pmap_unuse_l3(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
+static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
 
 #define        pmap_clear(pte)                 pmap_store(pte, 0)
 #define        pmap_clear_bits(pte, bits)      atomic_clear_64(pte, bits)
@@ -636,7 +682,8 @@ pmap_page_init(vm_page_t m)
 void
 pmap_init(void)
 {
-       int i;
+       vm_size_t s;
+       int i, pv_npg;
 
        /*
         * Initialize the pv chunk and pmap list mutexes.
@@ -649,6 +696,24 @@ pmap_init(void)
         */
        for (i = 0; i < NPV_LIST_LOCKS; i++)
                rw_init(&pv_list_locks[i], "pmap pv list");
+
+       /*
+        * Calculate the size of the pv head table for superpages.
+        */
+       pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE);
+
+       /*
+        * Allocate memory for the pv head table for superpages.
+        */
+       s = (vm_size_t)(pv_npg * sizeof(struct md_page));
+       s = round_page(s);
+       pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
+       for (i = 0; i < pv_npg; i++)
+               TAILQ_INIT(&pv_table[i].pv_list);
+       TAILQ_INIT(&pv_dummy.pv_list);
+
+       if (superpages_enabled)
+               pagesizes[1] = L2_SIZE;
 }
 
 #ifdef SMP
@@ -999,6 +1064,13 @@ pmap_qremove(vm_offset_t sva, int count)
        pmap_invalidate_range(kernel_pmap, sva, va);
 }
 
+bool
+pmap_ps_enabled(pmap_t pmap __unused)
+{
+
+       return (superpages_enabled);
+}
+
 /***************************************************
  * Page table page management routines.....
  ***************************************************/
@@ -1018,6 +1090,34 @@ pmap_add_delayed_free_list(vm_page_t m, struct spglist
                m->flags &= ~PG_ZERO;
        SLIST_INSERT_HEAD(free, m, plinks.s.ss);
 }
+
+/*
+ * Inserts the specified page table page into the specified pmap's collection
+ * of idle page table pages.  Each of a pmap's page table pages is responsible
+ * for mapping a distinct range of virtual addresses.  The pmap's collection is
+ * ordered by this virtual address range.
+ */
+static __inline int
+pmap_insert_pt_page(pmap_t pmap, vm_page_t ml3)
+{
+
+       PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+       return (vm_radix_insert(&pmap->pm_root, ml3));
+}
+
+/*
+ * Removes the page table page mapping the specified virtual address from the
+ * specified pmap's collection of idle page table pages, and returns it.
+ * Otherwise, returns NULL if there is no page table page corresponding to the
+ * specified virtual address.
+ */
+static __inline vm_page_t
+pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
+{
+
+       PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+       return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
+}
        
 /*
  * Decrements a page table page's wire count, which is used to record the
@@ -1026,12 +1126,12 @@ pmap_add_delayed_free_list(vm_page_t m, struct spglist
  * page table page was unmapped and FALSE otherwise.
  */
 static inline boolean_t
-pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
+pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 {
 
        --m->wire_count;
        if (m->wire_count == 0) {
-               _pmap_unwire_l3(pmap, va, m, free);
+               _pmap_unwire_ptp(pmap, va, m, free);
                return (TRUE);
        } else {
                return (FALSE);
@@ -1039,36 +1139,30 @@ pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t 
 }
 
 static void
-_pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
+_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 {
        vm_paddr_t phys;
 
        PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-       /*
-        * unmap the page table page
-        */
-       if (m->pindex >= NUPDE) {
-               /* PD page */
+       if (m->pindex >= NUL1E) {
                pd_entry_t *l1;
                l1 = pmap_l1(pmap, va);
                pmap_clear(l1);
                pmap_distribute_l1(pmap, pmap_l1_index(va), 0);
        } else {
-               /* PTE page */
                pd_entry_t *l2;
                l2 = pmap_l2(pmap, va);
                pmap_clear(l2);
        }
        pmap_resident_count_dec(pmap, 1);
-       if (m->pindex < NUPDE) {
+       if (m->pindex < NUL1E) {
                pd_entry_t *l1;
-               /* We just released a PT, unhold the matching PD */
                vm_page_t pdpg;
 
                l1 = pmap_l1(pmap, va);
                phys = PTE_TO_PHYS(pmap_load(l1));
                pdpg = PHYS_TO_VM_PAGE(phys);
-               pmap_unwire_l3(pmap, va, pdpg, free);
+               pmap_unwire_ptp(pmap, va, pdpg, free);
        }
        pmap_invalidate_page(pmap, va);
 
@@ -1082,24 +1176,20 @@ _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t
 }
 
 /*
- * After removing an l3 entry, this routine is used to
+ * After removing a page table entry, this routine is used to
  * conditionally free the page, and manage the hold/wire counts.
  */
 static int
-pmap_unuse_l3(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
+pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
     struct spglist *free)
 {
-       vm_paddr_t phys;
        vm_page_t mpte;
 
        if (va >= VM_MAXUSER_ADDRESS)
                return (0);
        KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
-
-       phys = PTE_TO_PHYS(ptepde);
-
-       mpte = PHYS_TO_VM_PAGE(phys);
-       return (pmap_unwire_l3(pmap, va, mpte, free));
+       mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(ptepde));
+       return (pmap_unwire_ptp(pmap, va, mpte, free));
 }
 
 void
@@ -1140,6 +1230,8 @@ pmap_pinit(pmap_t pmap)
        LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
        mtx_unlock(&allpmaps_lock);
 
+       vm_radix_init(&pmap->pm_root);
+
        return (1);
 }
 
@@ -1193,11 +1285,11 @@ _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, str
         * it isn't already there.
         */
 
-       if (ptepindex >= NUPDE) {
+       if (ptepindex >= NUL1E) {
                pd_entry_t *l1;
                vm_pindex_t l1index;
 
-               l1index = ptepindex - NUPDE;
+               l1index = ptepindex - NUL1E;
                l1 = &pmap->pm_l1[l1index];
 
                pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE);
@@ -1213,7 +1305,7 @@ _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, str
                l1 = &pmap->pm_l1[l1index];
                if (pmap_load(l1) == 0) {
                        /* recurse for allocating page dir */
-                       if (_pmap_alloc_l3(pmap, NUPDE + l1index,
+                       if (_pmap_alloc_l3(pmap, NUL1E + l1index,
                            lockp) == NULL) {
                                vm_page_unwire_noq(m);
                                vm_page_free_zero(m);
@@ -1241,6 +1333,29 @@ _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, str
 }
 
 static vm_page_t
+pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
+{
+       pd_entry_t *l1;
+       vm_page_t l2pg;
+       vm_pindex_t l2pindex;
+
+retry:
+       l1 = pmap_l1(pmap, va);
+       if (l1 != NULL && (pmap_load(l1) & PTE_RWX) == 0) {
+               /* Add a reference to the L2 page. */
+               l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l1)));
+               l2pg->wire_count++;
+       } else {
+               /* Allocate a L2 page. */
+               l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
+               l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
+               if (l2pg == NULL && lockp != NULL)
+                       goto retry;
+       }
+       return (l2pg);
+}
+
+static vm_page_t
 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 {
        vm_pindex_t ptepindex;
@@ -1599,6 +1714,79 @@ retry:
 }
 
 /*
+ * Ensure that the number of spare PV entries in the specified pmap meets or
+ * exceeds the given count, "needed".
+ *
+ * The given PV list lock may be released.
+ */
+static void
+reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
+{
+       struct pch new_tail;
+       struct pv_chunk *pc;
+       vm_page_t m;
+       int avail, free;
+       bool reclaimed;
+
+       rw_assert(&pvh_global_lock, RA_LOCKED);
+       PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+       KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
+
+       /*
+        * Newly allocated PV chunks must be stored in a private list until
+        * the required number of PV chunks have been allocated.  Otherwise,
+        * reclaim_pv_chunk() could recycle one of these chunks.  In
+        * contrast, these chunks must be added to the pmap upon allocation.
+        */
+       TAILQ_INIT(&new_tail);
+retry:
+       avail = 0;
+       TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
+               bit_count((bitstr_t *)pc->pc_map, 0,
+                   sizeof(pc->pc_map) * NBBY, &free);
+               if (free == 0)
+                       break;
+               avail += free;
+               if (avail >= needed)
+                       break;
+       }
+       for (reclaimed = false; avail < needed; avail += _NPCPV) {
+               m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
+                   VM_ALLOC_WIRED);
+               if (m == NULL) {
+                       m = reclaim_pv_chunk(pmap, lockp);
+                       if (m == NULL)
+                               goto retry;
+                       reclaimed = true;
+               }
+               /* XXX PV STATS */
+#if 0
+               dump_add_page(m->phys_addr);
+#endif
+               pc = (void *)PHYS_TO_DMAP(m->phys_addr);
+               pc->pc_pmap = pmap;
+               pc->pc_map[0] = PC_FREE0;
+               pc->pc_map[1] = PC_FREE1;
+               pc->pc_map[2] = PC_FREE2;
+               TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+               TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
+
+               /*
+                * The reclaim might have freed a chunk from the current pmap.
+                * If that chunk contained available entries, we need to
+                * re-count the number of available entries.
+                */
+               if (reclaimed)
+                       goto retry;
+       }
+       if (!TAILQ_EMPTY(&new_tail)) {
+               mtx_lock(&pv_chunks_mutex);
+               TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
+               mtx_unlock(&pv_chunks_mutex);
+       }
+}
+
+/*
  * First find and then remove the pv entry for the specified pmap and virtual
  * address from the specified pv list.  Returns the pv entry if found and NULL
  * otherwise.  This operation can be performed on pv lists for either 4KB or
@@ -1632,7 +1820,7 @@ pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_off
 
        pv = pmap_pvh_remove(pvh, pmap, va);
 
-       KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
+       KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va));
        free_pv_entry(pmap, pv);
 }
 
@@ -1660,6 +1848,222 @@ pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 
 }
 
 /*
+ * After demotion from a 2MB page mapping to 512 4KB page mappings,
+ * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
+ * entries for each of the 4KB page mappings.
+ */
+static void __unused
+pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+    struct rwlock **lockp)
+{
+       struct md_page *pvh;
+       struct pv_chunk *pc;
+       pv_entry_t pv;
+       vm_page_t m;
+       vm_offset_t va_last;
+       int bit, field;
+
+       rw_assert(&pvh_global_lock, RA_LOCKED);
+       PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+       CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
+
+       /*
+        * Transfer the 2mpage's pv entry for this mapping to the first
+        * page's pv list.  Once this transfer begins, the pv list lock
+        * must not be released until the last pv entry is reinstantiated.
+        */
+       pvh = pa_to_pvh(pa);
+       va &= ~L2_OFFSET;
+       pv = pmap_pvh_remove(pvh, pmap, va);
+       KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
+       m = PHYS_TO_VM_PAGE(pa);
+       TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
+       m->md.pv_gen++;
+       /* Instantiate the remaining 511 pv entries. */
+       va_last = va + L2_SIZE - PAGE_SIZE;
+       for (;;) {
+               pc = TAILQ_FIRST(&pmap->pm_pvchunk);
+               KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
+                   pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare"));
+               for (field = 0; field < _NPCM; field++) {
+                       while (pc->pc_map[field] != 0) {
+                               bit = ffsl(pc->pc_map[field]) - 1;
+                               pc->pc_map[field] &= ~(1ul << bit);
+                               pv = &pc->pc_pventry[field * 64 + bit];
+                               va += PAGE_SIZE;
+                               pv->pv_va = va;
+                               m++;
+                               KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+                           ("pmap_pv_demote_l2: page %p is not managed", m));
+                               TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
+                               m->md.pv_gen++;
+                               if (va == va_last)
+                                       goto out;
+                       }
+               }
+               TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+               TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
+       }
+out:
+       if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
+               TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+               TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
+       }
+       /* XXX PV stats */
+}
+
+#if VM_NRESERVLEVEL > 0
+static void
+pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+    struct rwlock **lockp)
+{
+       struct md_page *pvh;
+       pv_entry_t pv;
+       vm_page_t m;
+       vm_offset_t va_last;
+
+       rw_assert(&pvh_global_lock, RA_LOCKED);
+       KASSERT((va & L2_OFFSET) == 0,
+           ("pmap_pv_promote_l2: misaligned va %#lx", va));
+
+       CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
+
+       m = PHYS_TO_VM_PAGE(pa);
+       pv = pmap_pvh_remove(&m->md, pmap, va);
+       KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va));
+       pvh = pa_to_pvh(pa);
+       TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
+       pvh->pv_gen++;
+
+       va_last = va + L2_SIZE - PAGE_SIZE;
+       do {
+               m++;
+               va += PAGE_SIZE;
+               pmap_pvh_free(&m->md, pmap, va);
+       } while (va < va_last);
+}
+#endif /* VM_NRESERVLEVEL > 0 */
+
+/*
+ * Create the PV entry for a 2MB page mapping.  Always returns true unless the
+ * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
+ * false if the PV entry cannot be allocated without resorting to reclamation.
+ */
+static bool
+pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
+    struct rwlock **lockp)
+{
+       struct md_page *pvh;
+       pv_entry_t pv;
+       vm_paddr_t pa;
+
+       PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+       /* Pass NULL instead of the lock pointer to disable reclamation. */
+       if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
+           NULL : lockp)) == NULL)
+               return (false);
+       pv->pv_va = va;
+       pa = PTE_TO_PHYS(l2e);
+       CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
+       pvh = pa_to_pvh(pa);
+       TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
+       pvh->pv_gen++;
+       return (true);
+}
+
+static void
+pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
+{
+       pt_entry_t newl2, oldl2;
+       vm_page_t ml3;
+       vm_paddr_t ml3pa;
+
+       KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
+       KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
+       PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
+       ml3 = pmap_remove_pt_page(pmap, va);
+       if (ml3 == NULL)
+               panic("pmap_remove_kernel_l2: Missing pt page");
+
+       ml3pa = VM_PAGE_TO_PHYS(ml3);
+       newl2 = ml3pa | PTE_V;
+
+       /*
+        * Initialize the page table page.
+        */
+       pagezero((void *)PHYS_TO_DMAP(ml3pa));
+
+       /*
+        * Demote the mapping.
+        */
+       oldl2 = pmap_load_store(l2, newl2);
+       KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
+           __func__, l2, oldl2));
+}
+
+/*
+ * pmap_remove_l2: Do the things to unmap a level 2 superpage.
+ */
+static int
+pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
+    pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
+{
+       struct md_page *pvh;
+       pt_entry_t oldl2;
+       vm_offset_t eva, va;
+       vm_page_t m, ml3;
+
+       PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+       KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
+       oldl2 = pmap_load_clear(l2);
+       KASSERT((oldl2 & PTE_RWX) != 0,
+           ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2));
+
+       /*
+        * The sfence.vma documentation states that it is sufficient to specify
+        * a single address within a superpage mapping.  However, since we do
+        * not perform any invalidation upon promotion, TLBs may still be
+        * caching 4KB mappings within the superpage, so we must invalidate the
+        * entire range.
+        */
+       pmap_invalidate_range(pmap, sva, sva + L2_SIZE);
+       if ((oldl2 & PTE_SW_WIRED) != 0)
+               pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
+       pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
+       if ((oldl2 & PTE_SW_MANAGED) != 0) {
+               CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2));
+               pvh = pa_to_pvh(PTE_TO_PHYS(oldl2));
+               pmap_pvh_free(pvh, pmap, sva);
+               eva = sva + L2_SIZE;
+               for (va = sva, m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(oldl2));
+                   va < eva; va += PAGE_SIZE, m++) {
+                       if ((oldl2 & PTE_D) != 0)
+                               vm_page_dirty(m);
+                       if ((oldl2 & PTE_A) != 0)
+                               vm_page_aflag_set(m, PGA_REFERENCED);
+                       if (TAILQ_EMPTY(&m->md.pv_list) &&
+                           TAILQ_EMPTY(&pvh->pv_list))
+                               vm_page_aflag_clear(m, PGA_WRITEABLE);
+               }
+       }
+       if (pmap == kernel_pmap) {
+               pmap_remove_kernel_l2(pmap, l2, sva);
+       } else {
+               ml3 = pmap_remove_pt_page(pmap, sva);
+               if (ml3 != NULL) {
+                       pmap_resident_count_dec(pmap, 1);
+                       KASSERT(ml3->wire_count == Ln_ENTRIES,
+                           ("pmap_remove_l2: l3 page wire count error"));
+                       ml3->wire_count = 1;
+                       vm_page_unwire_noq(ml3);
+                       pmap_add_delayed_free_list(ml3, free, FALSE);
+               }
+       }
+       return (pmap_unuse_pt(pmap, sva, l1e, free));
+}
+
+/*
  * pmap_remove_l3: do the things to unmap a page in a process
  */
 static int
@@ -1687,7 +2091,7 @@ pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_
                pmap_pvh_free(&m->md, pmap, va);
        }
 
-       return (pmap_unuse_l3(pmap, va, l2e, free));
+       return (pmap_unuse_pt(pmap, va, l2e, free));
 }
 
 /*
@@ -1699,11 +2103,11 @@ pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_
 void
 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
+       struct spglist free;
        struct rwlock *lock;
        vm_offset_t va, va_next;
-       pd_entry_t *l1, *l2;
-       pt_entry_t l3_pte, *l3;
-       struct spglist free;
+       pd_entry_t *l1, *l2, l2e;
+       pt_entry_t *l3;
 
        /*
         * Perform an unsynchronized read.  This is, however, safe.
@@ -1739,16 +2143,22 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t 
                l2 = pmap_l1_to_l2(l1, sva);
                if (l2 == NULL)
                        continue;
-
-               l3_pte = pmap_load(l2);
-
-               /*
-                * Weed out invalid mappings.
-                */
-               if (l3_pte == 0)
+               if ((l2e = pmap_load(l2)) == 0)
                        continue;
-               if ((pmap_load(l2) & PTE_RX) != 0)
-                       continue;
+               if ((l2e & PTE_RWX) != 0) {
+                       if (sva + L2_SIZE == va_next && eva >= va_next) {
+                               (void)pmap_remove_l2(pmap, l2, sva,
+                                   pmap_load(l1), &free, &lock);
+                               continue;
+                       } else if (!pmap_demote_l2_locked(pmap, l2, sva,
+                           &lock)) {
+                               /*
+                                * The large page mapping was destroyed.
+                                */
+                               continue;
+                       }
+                       l2e = pmap_load(l2);
+               }
 
                /*
                 * Limit our scan to either the end of the va represented
@@ -1761,8 +2171,6 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t 
                va = va_next;
                for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
                    sva += L3_SIZE) {
-                       if (l3 == NULL)
-                               panic("l3 == NULL");
                        if (pmap_load(l3) == 0) {
                                if (va != va_next) {
                                        pmap_invalidate_range(pmap, va, sva);
@@ -1772,8 +2180,7 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t 
                        }
                        if (va == va_next)
                                va = sva;
-                       if (pmap_remove_l3(pmap, l3, sva, l3_pte, &free,
-                           &lock)) {
+                       if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) {
                                sva += L3_SIZE;
                                break;
                        }
@@ -1783,7 +2190,7 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t 
        }
        if (lock != NULL)
                rw_wunlock(lock);
-       rw_runlock(&pvh_global_lock);   
+       rw_runlock(&pvh_global_lock);
        PMAP_UNLOCK(pmap);
        vm_page_free_pages_toq(&free, false);
 }
@@ -1804,42 +2211,54 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t 
 void
 pmap_remove_all(vm_page_t m)
 {
-       pv_entry_t pv;
-       pmap_t pmap;
-       pt_entry_t *l3, tl3;
-       pd_entry_t *l2, tl2;
        struct spglist free;
+       struct md_page *pvh;
+       pmap_t pmap;
+       pt_entry_t *l3, l3e;
+       pd_entry_t *l2, l2e;
+       pv_entry_t pv;
+       vm_offset_t va;
 
        KASSERT((m->oflags & VPO_UNMANAGED) == 0,
            ("pmap_remove_all: page %p is not managed", m));
        SLIST_INIT(&free);
+       pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
+           pa_to_pvh(VM_PAGE_TO_PHYS(m));
+
        rw_wlock(&pvh_global_lock);
+       while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
+               pmap = PV_PMAP(pv);
+               PMAP_LOCK(pmap);
+               va = pv->pv_va;
+               l2 = pmap_l2(pmap, va);
+               (void)pmap_demote_l2(pmap, l2, va);
+               PMAP_UNLOCK(pmap);
+       }
        while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
                pmap = PV_PMAP(pv);
                PMAP_LOCK(pmap);
                pmap_resident_count_dec(pmap, 1);
                l2 = pmap_l2(pmap, pv->pv_va);
                KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found"));
-               tl2 = pmap_load(l2);
+               l2e = pmap_load(l2);
 
-               KASSERT((tl2 & PTE_RX) == 0,
-                   ("pmap_remove_all: found a table when expecting "
-                   "a block in %p's pv list", m));
+               KASSERT((l2e & PTE_RX) == 0,
+                   ("pmap_remove_all: found a superpage in %p's pv list", m));
 
                l3 = pmap_l2_to_l3(l2, pv->pv_va);
-               tl3 = pmap_load_clear(l3);
+               l3e = pmap_load_clear(l3);
                pmap_invalidate_page(pmap, pv->pv_va);
-               if (tl3 & PTE_SW_WIRED)
+               if (l3e & PTE_SW_WIRED)
                        pmap->pm_stats.wired_count--;
-               if ((tl3 & PTE_A) != 0)
+               if ((l3e & PTE_A) != 0)
                        vm_page_aflag_set(m, PGA_REFERENCED);
 
                /*
                 * Update the vm_page_t clean and reference bits.
                 */
-               if ((tl3 & PTE_D) != 0)
+               if ((l3e & PTE_D) != 0)
                        vm_page_dirty(m);
-               pmap_unuse_l3(pmap, pv->pv_va, pmap_load(l2), &free);
+               pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free);
                TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
                m->md.pv_gen++;
                free_pv_entry(pmap, pv);
@@ -1857,10 +2276,12 @@ pmap_remove_all(vm_page_t m)
 void
 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 {
-       pd_entry_t *l1, *l2;
+       pd_entry_t *l1, *l2, l2e;
        pt_entry_t *l3, l3e, mask;
        vm_page_t m;
-       vm_offset_t va_next;
+       vm_paddr_t pa;
+       vm_offset_t va, va_next;
+       bool anychanged, pv_lists_locked;
 
        if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
                pmap_remove(pmap, sva, eva);
@@ -1871,12 +2292,14 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t
            (VM_PROT_WRITE | VM_PROT_EXECUTE))
                return;
 
+       anychanged = false;
+       pv_lists_locked = false;
        mask = 0;
        if ((prot & VM_PROT_WRITE) == 0)
                mask |= PTE_W | PTE_D;
        if ((prot & VM_PROT_EXECUTE) == 0)
                mask |= PTE_X;
-
+resume:
        PMAP_LOCK(pmap);
        for (; sva < eva; sva = va_next) {
                l1 = pmap_l1(pmap, sva);
@@ -1892,10 +2315,41 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t
                        va_next = eva;
 
                l2 = pmap_l1_to_l2(l1, sva);
-               if (l2 == NULL || pmap_load(l2) == 0)
+               if (l2 == NULL || (l2e = pmap_load(l2)) == 0)
                        continue;
-               if ((pmap_load(l2) & PTE_RX) != 0)
-                       continue;
+               if ((l2e & PTE_RWX) != 0) {
+                       if (sva + L2_SIZE == va_next && eva >= va_next) {
+retryl2:
+                               if ((l2e & (PTE_SW_MANAGED | PTE_D)) ==
+                                   (PTE_SW_MANAGED | PTE_D)) {
+                                       pa = PTE_TO_PHYS(l2e);
+                                       for (va = sva, m = PHYS_TO_VM_PAGE(pa);
+                                           va < va_next; m++, va += PAGE_SIZE)
+                                               vm_page_dirty(m);
+                               }
+                               if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask))
+                                       goto retryl2;
+                               anychanged = true;
+                       } else {
+                               if (!pv_lists_locked) {
+                                       pv_lists_locked = true;
+                                       if (!rw_try_rlock(&pvh_global_lock)) {
+                                               if (anychanged)
+                                                       pmap_invalidate_all(
+                                                           pmap);
+                                               PMAP_UNLOCK(pmap);
+                                               rw_rlock(&pvh_global_lock);
+                                               goto resume;
+                                       }
+                               }
+                               if (!pmap_demote_l2(pmap, l2, sva)) {
+                                       /*
+                                        * The large page mapping was destroyed.
+                                        */
+                                       continue;
+                               }
+                       }
+               }
 
                if (va_next > eva)
                        va_next = eva;
@@ -1903,7 +2357,7 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t
                for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
                    sva += L3_SIZE) {
                        l3e = pmap_load(l3);
-retry:
+retryl3:
                        if ((l3e & PTE_V) == 0)
                                continue;
                        if ((prot & VM_PROT_WRITE) == 0 &&
@@ -1913,60 +2367,236 @@ retry:
                                vm_page_dirty(m);
                        }
                        if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask))
-                               goto retry;
-                       /* XXX: Use pmap_invalidate_range */
-                       pmap_invalidate_page(pmap, sva);
+                               goto retryl3;
+                       anychanged = true;
                }
        }
+       if (anychanged)
+               pmap_invalidate_all(pmap);
+       if (pv_lists_locked)
+               rw_runlock(&pvh_global_lock);
        PMAP_UNLOCK(pmap);
 }
 
 int
 pmap_fault_fixup(pmap_t pmap, vm_offset_t va, vm_prot_t ftype)
 {
-       pt_entry_t orig_l3;
-       pt_entry_t new_l3;
-       pt_entry_t *l3;
+       pd_entry_t *l2, l2e;

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
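The effect of this change is visible from userland: pmap_init() now publishes L2_SIZE in pagesizes[] when superpages are enabled, and the pmap.c hunk above adds the vm.pmap.superpages_enabled tunable plus counters under vm.pmap.l2.  The following is a minimal illustrative sketch (not part of the commit) using the standard getpagesizes(3) and sysctlbyname(3) interfaces; only the sysctl names and the 2MB L2_SIZE value are taken from the diff, the rest is assumed scaffolding.

#include <sys/types.h>
#include <sys/mman.h>           /* getpagesizes(3) */
#include <sys/sysctl.h>         /* sysctlbyname(3) */

#include <stdio.h>

int
main(void)
{
        size_t len, sizes[8];
        u_long promotions;
        int i, n;

        /*
         * pmap_init() sets pagesizes[1] = L2_SIZE when superpages are
         * enabled, so 2097152 should be reported here alongside 4096.
         */
        n = getpagesizes(sizes, 8);
        for (i = 0; i < n; i++)
                printf("supported page size: %zu\n", sizes[i]);

        /* One of the counters published under the new vm.pmap.l2 node. */
        len = sizeof(promotions);
        if (sysctlbyname("vm.pmap.l2.promotions", &promotions, &len,
            NULL, 0) == 0)
                printf("2MB page promotions: %lu\n", promotions);
        return (0);
}

On a stable/12 RISC-V kernel containing this change, with vm.pmap.superpages_enabled left at its default of 1, the sketch should list both 4096 and 2097152 as supported page sizes.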