Author: kib
Date: Wed Feb 20 09:51:13 2019
New Revision: 344353
URL: https://svnweb.freebsd.org/changeset/base/344353

Log:
  Add kernel support for the Intel userspace protection keys (PKU)
  feature on Skylake Xeons.
  
  See SDM rev. 68, Vol. 3, section 4.6.2 (Protection Keys) and the
  descriptions of the RDPKRU and WRPKRU instructions.
  
  Reviewed by:  markj
  Tested by:    pho
  Sponsored by: The FreeBSD Foundation
  MFC after:    2 weeks
  Differential revision:        https://reviews.freebsd.org/D18893
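
  For illustration, here is a minimal userspace sketch of the new
  interface.  The sysarch(2) operations and struct amd64_set_pkru are
  taken from this change; the helper names are hypothetical, and the
  PKRU register encoding (two bits per key: access-disable in bit 2i,
  write-disable in bit 2i + 1) follows the SDM:

	#include <machine/sysarch.h>
	#include <stddef.h>
	#include <stdint.h>

	/* PKRU holds two bits per key: AD (bit 2i) and WD (bit 2i + 1). */
	#define	PKRU_AD(i)	(1u << ((i) * 2))
	#define	PKRU_WD(i)	(2u << ((i) * 2))

	static inline void
	wrpkru(uint32_t pkru)
	{
		/* WRPKRU requires %ecx = %edx = 0. */
		__asm __volatile("wrpkru" : : "a" (pkru), "c" (0), "d" (0));
	}

	/*
	 * Tag [addr, addr + len) with keyidx, then write-disable that
	 * key.  All other keys, including key 0, which covers mappings
	 * without an assigned key, remain fully accessible.
	 */
	static int
	deny_writes(void *addr, size_t len, unsigned int keyidx)
	{
		struct amd64_set_pkru s;

		s.addr = addr;
		s.len = len;
		s.keyidx = keyidx;
		s.flags = 0;
		if (sysarch(AMD64_SET_PKRU, &s) != 0)
			return (-1);
		wrpkru(PKRU_WD(keyidx));
		return (0);
	}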

Modified:
  head/sys/amd64/amd64/initcpu.c
  head/sys/amd64/amd64/pmap.c
  head/sys/amd64/amd64/sys_machdep.c
  head/sys/amd64/amd64/trap.c
  head/sys/amd64/include/pmap.h
  head/sys/arm/include/pmap.h
  head/sys/arm64/include/pmap.h
  head/sys/i386/include/pmap.h
  head/sys/mips/include/pmap.h
  head/sys/powerpc/include/pmap.h
  head/sys/riscv/include/pmap.h
  head/sys/sparc64/include/pmap.h
  head/sys/vm/vm_fault.c
  head/sys/vm/vm_map.c
  head/sys/x86/include/sysarch.h

Modified: head/sys/amd64/amd64/initcpu.c
==============================================================================
--- head/sys/amd64/amd64/initcpu.c      Wed Feb 20 09:46:44 2019        (r344352)
+++ head/sys/amd64/amd64/initcpu.c      Wed Feb 20 09:51:13 2019        (r344353)
@@ -233,6 +233,9 @@ initializecpu(void)
        if (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE)
                cr4 |= CR4_FSGSBASE;
 
+       if (cpu_stdext_feature2 & CPUID_STDEXT2_PKU)
+               cr4 |= CR4_PKE;
+
        /*
         * Postpone enabling the SMEP on the boot CPU until the page
         * tables are switched from the boot loader identity mapping

Modified: head/sys/amd64/amd64/pmap.c
==============================================================================
--- head/sys/amd64/amd64/pmap.c Wed Feb 20 09:46:44 2019        (r344352)
+++ head/sys/amd64/amd64/pmap.c Wed Feb 20 09:51:13 2019        (r344353)
@@ -48,7 +48,7 @@
  */
 /*-
  * Copyright (c) 2003 Networks Associates Technology, Inc.
- * Copyright (c) 2014-2018 The FreeBSD Foundation
+ * Copyright (c) 2014-2019 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Jake Burkholder,
@@ -121,6 +121,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/rangeset.h>
 #include <sys/rwlock.h>
 #include <sys/sx.h>
 #include <sys/turnstile.h>
@@ -155,6 +156,7 @@ __FBSDID("$FreeBSD$");
 #ifdef SMP
 #include <machine/smp.h>
 #endif
+#include <machine/sysarch.h>
 #include <machine/tss.h>
 
 static __inline boolean_t
@@ -285,6 +287,13 @@ pmap_modified_bit(pmap_t pmap)
        return (mask);
 }
 
+static __inline pt_entry_t
+pmap_pku_mask_bit(pmap_t pmap)
+{
+
+       return (pmap->pm_type == PT_X86 ? X86_PG_PKU_MASK : 0);
+}
+
 #if !defined(DIAGNOSTIC)
 #ifdef __GNUC_GNU_INLINE__
 #define PMAP_INLINE    __attribute__((__gnu_inline__)) inline
@@ -424,6 +433,22 @@ static pml4_entry_t *pti_pml4;
 static vm_pindex_t pti_pg_idx;
 static bool pti_finalized;
 
+struct pmap_pkru_range {
+       struct rs_el    pkru_rs_el;
+       u_int           pkru_keyidx;
+       int             pkru_flags;
+};
+
+static uma_zone_t pmap_pkru_ranges_zone;
+static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
+static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va);
+static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
+static void *pkru_dup_range(void *ctx, void *data);
+static void pkru_free_range(void *ctx, void *node);
+static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap);
+static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
+static void pmap_pkru_deassign_all(pmap_t pmap);
+
 static int
 pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
 {
@@ -2846,6 +2871,12 @@ pmap_pinit0(pmap_t pmap)
                pmap->pm_pcids[i].pm_gen = 1;
        }
        pmap_activate_boot(pmap);
+
+       if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
+               pmap_pkru_ranges_zone = uma_zcreate("pkru ranges",
+                   sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL,
+                   UMA_ALIGN_PTR, 0);
+       }
 }
 
 void
@@ -2934,6 +2965,10 @@ pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, i
                        pmap_pinit_pml4_pti(pml4pgu);
                        pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu);
                }
+               if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
+                       rangeset_init(&pmap->pm_pkru, pkru_dup_range,
+                           pkru_free_range, pmap, M_NOWAIT);
+               }
        }
 
        pmap->pm_root.rt_root = 0;
@@ -3230,6 +3265,9 @@ pmap_release(pmap_t pmap)
                vm_page_unwire_noq(m);
                vm_page_free(m);
        }
+       if (pmap->pm_type == PT_X86 &&
+           (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0)
+               rangeset_fini(&pmap->pm_pkru);
 }
 
 static int
@@ -4060,7 +4098,7 @@ pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, v
 {
        pd_entry_t newpde, oldpde;
        pt_entry_t *firstpte, newpte;
-       pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V;
+       pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V;
        vm_paddr_t mptepa;
        vm_page_t mpte;
        struct spglist free;
@@ -4073,6 +4111,7 @@ pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, v
        PG_RW = pmap_rw_bit(pmap);
        PG_V = pmap_valid_bit(pmap);
        PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
+       PG_PKU_MASK = pmap_pku_mask_bit(pmap);
 
        PMAP_LOCK_ASSERT(pmap, MA_OWNED);
        oldpde = *pde;
@@ -4505,6 +4544,7 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t 
 out:
        if (anyvalid)
                pmap_invalidate_all(pmap);
+       pmap_pkru_on_remove(pmap, sva, eva);
        PMAP_UNLOCK(pmap);
        pmap_delayed_invl_finished();
        vm_page_free_pages_toq(&free, true);
@@ -4816,7 +4856,7 @@ pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offs
 {
        pd_entry_t newpde;
        pt_entry_t *firstpte, oldpte, pa, *pte;
-       pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V;
+       pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V, PG_PKU_MASK;
        vm_page_t mpte;
        int PG_PTE_CACHE;
 
@@ -4825,6 +4865,7 @@ pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offs
        PG_M = pmap_modified_bit(pmap);
        PG_V = pmap_valid_bit(pmap);
        PG_RW = pmap_rw_bit(pmap);
+       PG_PKU_MASK = pmap_pku_mask_bit(pmap);
        PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
 
        PMAP_LOCK_ASSERT(pmap, MA_OWNED);
@@ -5052,6 +5093,8 @@ retry:
 
        origpte = *pte;
        pv = NULL;
+       if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86)
+               newpte |= pmap_pkru_get(pmap, va);
 
        /*
         * Is the specified virtual address already mapped?
@@ -5271,6 +5314,25 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t
                    " in pmap %p", va, pmap);
                return (KERN_RESOURCE_SHORTAGE);
        }
+
+       /*
+        * If pkru is not the same for the whole pde range, return failure
+        * and let vm_fault() cope.  Check after pde allocation, since
+        * it could sleep.
+        */
+       if (!pmap_pkru_same(pmap, va, va + NBPDR)) {
+               SLIST_INIT(&free);
+               if (pmap_unwire_ptp(pmap, va, pdpg, &free)) {
+                       pmap_invalidate_page(pmap, va);
+                       vm_page_free_pages_toq(&free, true);
+               }
+               return (KERN_FAILURE);
+       }
+       if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) {
+               newpde &= ~X86_PG_PKU_MASK;
+               newpde |= pmap_pkru_get(pmap, va);
+       }
+
        pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
        pde = &pde[pmap_pde_index(va)];
        oldpde = *pde;
@@ -5530,7 +5592,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, v
        if ((prot & VM_PROT_EXECUTE) == 0)
                newpte |= pg_nx;
        if (va < VM_MAXUSER_ADDRESS)
-               newpte |= PG_U;
+               newpte |= PG_U | pmap_pkru_get(pmap, va);
        pte_store(pte, newpte);
        return (mpte);
 }
@@ -5906,6 +5968,36 @@ out:
        PMAP_UNLOCK(dst_pmap);
 }
 
+int
+pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap)
+{
+       int error;
+
+       if (dst_pmap->pm_type != src_pmap->pm_type ||
+           dst_pmap->pm_type != PT_X86 ||
+           (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
+               return (0);
+       for (;;) {
+               if (dst_pmap < src_pmap) {
+                       PMAP_LOCK(dst_pmap);
+                       PMAP_LOCK(src_pmap);
+               } else {
+                       PMAP_LOCK(src_pmap);
+                       PMAP_LOCK(dst_pmap);
+               }
+               error = pmap_pkru_copy(dst_pmap, src_pmap);
+               /* Clean up partial copy on failure due to no memory. */
+               if (error == ENOMEM)
+                       pmap_pkru_deassign_all(dst_pmap);
+               PMAP_UNLOCK(src_pmap);
+               PMAP_UNLOCK(dst_pmap);
+               if (error != ENOMEM)
+                       break;
+               vm_wait(NULL);
+       }
+       return (error);
+}
+
 /*
  * Zero the specified hardware page.
  */
@@ -6305,6 +6397,7 @@ pmap_remove_pages(pmap_t pmap)
        if (lock != NULL)
                rw_wunlock(lock);
        pmap_invalidate_all(pmap);
+       pmap_pkru_deassign_all(pmap);
        PMAP_UNLOCK(pmap);
        vm_page_free_pages_toq(&free, true);
 }
@@ -8939,6 +9032,285 @@ pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva)
        }
        pmap_invalidate_range(kernel_pmap, sva, eva);
        VM_OBJECT_WUNLOCK(pti_obj);
+}
+
+static void *
+pkru_dup_range(void *ctx __unused, void *data)
+{
+       struct pmap_pkru_range *node, *new_node;
+
+       new_node = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT);
+       if (new_node == NULL)
+               return (NULL);
+       node = data;
+       memcpy(new_node, node, sizeof(*node));
+       return (new_node);
+}
+
+static void
+pkru_free_range(void *ctx __unused, void *node)
+{
+
+       uma_zfree(pmap_pkru_ranges_zone, node);
+}
+
+static int
+pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
+    int flags)
+{
+       struct pmap_pkru_range *ppr;
+       int error;
+
+       PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+       MPASS(pmap->pm_type == PT_X86);
+       MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
+       if ((flags & AMD64_PKRU_EXCL) != 0 &&
+           !rangeset_check_empty(&pmap->pm_pkru, sva, eva))
+               return (EBUSY);
+       ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT);
+       if (ppr == NULL)
+               return (ENOMEM);
+       ppr->pkru_keyidx = keyidx;
+       ppr->pkru_flags = flags & AMD64_PKRU_PERSIST;
+       error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr);
+       if (error != 0)
+               uma_zfree(pmap_pkru_ranges_zone, ppr);
+       return (error);
+}
+
+static int
+pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+
+       PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+       MPASS(pmap->pm_type == PT_X86);
+       MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
+       return (rangeset_remove(&pmap->pm_pkru, sva, eva));
+}
+
+static void
+pmap_pkru_deassign_all(pmap_t pmap)
+{
+
+       PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+       if (pmap->pm_type == PT_X86 &&
+           (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0)
+               rangeset_remove_all(&pmap->pm_pkru);
+}
+
+static bool
+pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+       struct pmap_pkru_range *ppr, *prev_ppr;
+       vm_offset_t va;
+
+       PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+       if (pmap->pm_type != PT_X86 ||
+           (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 ||
+           sva >= VM_MAXUSER_ADDRESS)
+               return (true);
+       MPASS(eva <= VM_MAXUSER_ADDRESS);
+       for (va = sva, prev_ppr = NULL; va < eva; prev_ppr = ppr) {
+               ppr = rangeset_lookup(&pmap->pm_pkru, va);
+               if (va != sva && (ppr == NULL) ^ (prev_ppr == NULL))
+                       return (false);
+               if (ppr == NULL) {
+                       va += PAGE_SIZE;
+                       continue;
+               }
+               if (prev_ppr != NULL &&
+                   prev_ppr->pkru_keyidx != ppr->pkru_keyidx)
+                       return (false);
+               va = ppr->pkru_rs_el.re_end;
+       }
+       return (true);
+}
+
+static pt_entry_t
+pmap_pkru_get(pmap_t pmap, vm_offset_t va)
+{
+       struct pmap_pkru_range *ppr;
+
+       PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+       if (pmap->pm_type != PT_X86 ||
+           (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 ||
+           va >= VM_MAXUSER_ADDRESS)
+               return (0);
+       ppr = rangeset_lookup(&pmap->pm_pkru, va);
+       if (ppr != NULL)
+               return (X86_PG_PKU(ppr->pkru_keyidx));
+       return (0);
+}
+
+static bool
+pred_pkru_on_remove(void *ctx __unused, void *r)
+{
+       struct pmap_pkru_range *ppr;
+
+       ppr = r;
+       return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0);
+}
+
+static void
+pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+
+       PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+       if (pmap->pm_type == PT_X86 &&
+           (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
+               rangeset_remove_pred(&pmap->pm_pkru, sva, eva,
+                   pred_pkru_on_remove);
+       }
+}
+
+static int
+pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap)
+{
+
+       PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED);
+       PMAP_LOCK_ASSERT(src_pmap, MA_OWNED);
+       MPASS(dst_pmap->pm_type == PT_X86);
+       MPASS(src_pmap->pm_type == PT_X86);
+       MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
+       if (src_pmap->pm_pkru.rs_data_ctx == NULL)
+               return (0);
+       return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru));
+}
+
+static void
+pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
+    u_int keyidx)
+{
+       pml4_entry_t *pml4e;
+       pdp_entry_t *pdpe;
+       pd_entry_t newpde, ptpaddr, *pde;
+       pt_entry_t newpte, *ptep, pte;
+       vm_offset_t va, va_next;
+       bool changed;
+
+       PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+       MPASS(pmap->pm_type == PT_X86);
+       MPASS(keyidx <= PMAP_MAX_PKRU_IDX);
+
+       for (changed = false, va = sva; va < eva; va = va_next) {
+               pml4e = pmap_pml4e(pmap, va);
+               if ((*pml4e & X86_PG_V) == 0) {
+                       va_next = (va + NBPML4) & ~PML4MASK;
+                       if (va_next < va)
+                               va_next = eva;
+                       continue;
+               }
+
+               pdpe = pmap_pml4e_to_pdpe(pml4e, va);
+               if ((*pdpe & X86_PG_V) == 0) {
+                       va_next = (va + NBPDP) & ~PDPMASK;
+                       if (va_next < va)
+                               va_next = eva;
+                       continue;
+               }
+
+               va_next = (va + NBPDR) & ~PDRMASK;
+               if (va_next < va)
+                       va_next = eva;
+
+               pde = pmap_pdpe_to_pde(pdpe, va);
+               ptpaddr = *pde;
+               if (ptpaddr == 0)
+                       continue;
+
+               MPASS((ptpaddr & X86_PG_V) != 0);
+               if ((ptpaddr & PG_PS) != 0) {
+                       if (va + NBPDR == va_next && eva >= va_next) {
+                               newpde = (ptpaddr & ~X86_PG_PKU_MASK) |
+                                   X86_PG_PKU(keyidx);
+                               if (newpde != ptpaddr) {
+                                       *pde = newpde;
+                                       changed = true;
+                               }
+                               continue;
+                       } else if (!pmap_demote_pde(pmap, pde, va)) {
+                               continue;
+                       }
+               }
+
+               if (va_next > eva)
+                       va_next = eva;
+
+               for (ptep = pmap_pde_to_pte(pde, va); va != va_next;
+                   ptep++, va += PAGE_SIZE) {
+                       pte = *ptep;
+                       if ((pte & X86_PG_V) == 0)
+                               continue;
+                       newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx);
+                       if (newpte != pte) {
+                               *ptep = newpte;
+                               changed = true;
+                       }
+               }
+       }
+       if (changed)
+               pmap_invalidate_range(pmap, sva, eva);
+}
+
+static int
+pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
+    u_int keyidx, int flags)
+{
+
+       if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX ||
+           (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0)
+               return (EINVAL);
+       if (eva <= sva || eva > VM_MAXUSER_ADDRESS)
+               return (EFAULT);
+       if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
+               return (ENOTSUP);
+       return (0);
+}
+
+int
+pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
+    int flags)
+{
+       int error;
+
+       sva = trunc_page(sva);
+       eva = round_page(eva);
+       error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags);
+       if (error != 0)
+               return (error);
+       for (;;) {
+               PMAP_LOCK(pmap);
+               error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags);
+               if (error == 0)
+                       pmap_pkru_update_range(pmap, sva, eva, keyidx);
+               PMAP_UNLOCK(pmap);
+               if (error != ENOMEM)
+                       break;
+               vm_wait(NULL);
+       }
+       return (error);
+}
+
+int
+pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+       int error;
+
+       sva = trunc_page(sva);
+       eva = round_page(eva);
+       error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0);
+       if (error != 0)
+               return (error);
+       for (;;) {
+               PMAP_LOCK(pmap);
+               error = pmap_pkru_deassign(pmap, sva, eva);
+               if (error == 0)
+                       pmap_pkru_update_range(pmap, sva, eva, 0);
+               PMAP_UNLOCK(pmap);
+               if (error != ENOMEM)
+                       break;
+               vm_wait(NULL);
+       }
+       return (error);
 }
 
 #include "opt_ddb.h"

Modified: head/sys/amd64/amd64/sys_machdep.c
==============================================================================
--- head/sys/amd64/amd64/sys_machdep.c  Wed Feb 20 09:46:44 2019        (r344352)
+++ head/sys/amd64/amd64/sys_machdep.c  Wed Feb 20 09:51:13 2019        (r344353)
@@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
+#include <sys/pcpu.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/smp.h>
@@ -53,6 +54,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>                /* for kernel_map */
+#include <vm/vm_map.h>
 #include <vm/vm_extern.h>
 
 #include <machine/frame.h>
@@ -170,13 +172,16 @@ update_gdt_fsbase(struct thread *td, uint32_t base)
 int
 sysarch(struct thread *td, struct sysarch_args *uap)
 {
-       int error = 0;
-       struct pcb *pcb = curthread->td_pcb;
+       struct pcb *pcb;
+       struct vm_map *map;
        uint32_t i386base;
        uint64_t a64base;
        struct i386_ioperm_args iargs;
        struct i386_get_xfpustate i386xfpu;
+       struct i386_set_pkru i386pkru;
        struct amd64_get_xfpustate a64xfpu;
+       struct amd64_set_pkru a64pkru;
+       int error;
 
 #ifdef CAPABILITY_MODE
        /*
@@ -194,11 +199,15 @@ sysarch(struct thread *td, struct sysarch_args *uap)
                case I386_GET_GSBASE:
                case I386_SET_GSBASE:
                case I386_GET_XFPUSTATE:
+               case I386_SET_PKRU:
+               case I386_CLEAR_PKRU:
                case AMD64_GET_FSBASE:
                case AMD64_SET_FSBASE:
                case AMD64_GET_GSBASE:
                case AMD64_SET_GSBASE:
                case AMD64_GET_XFPUSTATE:
+               case AMD64_SET_PKRU:
+               case AMD64_CLEAR_PKRU:
                        break;
 
                case I386_SET_IOPERM:
@@ -214,6 +223,10 @@ sysarch(struct thread *td, struct sysarch_args *uap)
 
        if (uap->op == I386_GET_LDT || uap->op == I386_SET_LDT)
                return (sysarch_ldt(td, uap, UIO_USERSPACE));
+
+       error = 0;
+       pcb = td->td_pcb;
+
        /*
         * XXXKIB check that the BSM generation code knows to encode
         * the op argument.
@@ -233,11 +246,27 @@ sysarch(struct thread *td, struct sysarch_args *uap)
                a64xfpu.addr = (void *)(uintptr_t)i386xfpu.addr;
                a64xfpu.len = i386xfpu.len;
                break;
+       case I386_SET_PKRU:
+       case I386_CLEAR_PKRU:
+               if ((error = copyin(uap->parms, &i386pkru,
+                   sizeof(struct i386_set_pkru))) != 0)
+                       return (error);
+               a64pkru.addr = (void *)(uintptr_t)i386pkru.addr;
+               a64pkru.len = i386pkru.len;
+               a64pkru.keyidx = i386pkru.keyidx;
+               a64pkru.flags = i386pkru.flags;
+               break;
        case AMD64_GET_XFPUSTATE:
                if ((error = copyin(uap->parms, &a64xfpu,
                    sizeof(struct amd64_get_xfpustate))) != 0)
                        return (error);
                break;
+       case AMD64_SET_PKRU:
+       case AMD64_CLEAR_PKRU:
+               if ((error = copyin(uap->parms, &a64pkru,
+                   sizeof(struct amd64_set_pkru))) != 0)
+                       return (error);
+               break;
        default:
                break;
        }
@@ -324,6 +353,34 @@ sysarch(struct thread *td, struct sysarch_args *uap)
                fpugetregs(td);
                error = copyout((char *)(get_pcb_user_save_td(td) + 1),
                    a64xfpu.addr, a64xfpu.len);
+               break;
+
+       case I386_SET_PKRU:
+       case AMD64_SET_PKRU:
+               /*
+                * Read-lock the map to synchronize with parallel
+                * pmap_vmspace_copy() on fork.
+                */
+               map = &td->td_proc->p_vmspace->vm_map;
+               vm_map_lock_read(map);
+               error = pmap_pkru_set(PCPU_GET(curpmap),
+                   (vm_offset_t)a64pkru.addr, (vm_offset_t)a64pkru.addr +
+                   a64pkru.len, a64pkru.keyidx, a64pkru.flags);
+               vm_map_unlock_read(map);
+               break;
+
+       case I386_CLEAR_PKRU:
+       case AMD64_CLEAR_PKRU:
+               if (a64pkru.flags != 0 || a64pkru.keyidx != 0) {
+                       error = EINVAL;
+                       break;
+               }
+               map = &td->td_proc->p_vmspace->vm_map;
+               vm_map_lock_read(map);
+               error = pmap_pkru_clear(PCPU_GET(curpmap),
+                   (vm_offset_t)a64pkru.addr,
+                   (vm_offset_t)a64pkru.addr + a64pkru.len);
+               vm_map_unlock_read(map);
                break;
 
        default:

Modified: head/sys/amd64/amd64/trap.c
==============================================================================
--- head/sys/amd64/amd64/trap.c Wed Feb 20 09:46:44 2019        (r344352)
+++ head/sys/amd64/amd64/trap.c Wed Feb 20 09:51:13 2019        (r344353)
@@ -808,6 +808,20 @@ trap_pfault(struct trapframe *frame, int usermode)
        }
 
        /*
+        * User-mode protection key violation (PKU).  May happen
+        * either from usermode or from kernel if copyin accessed
+        * key-protected mapping.
+        */
+       if ((frame->tf_err & PGEX_PK) != 0) {
+               if (eva > VM_MAXUSER_ADDRESS) {
+                       trap_fatal(frame, eva);
+                       return (-1);
+               }
+               rv = KERN_PROTECTION_FAILURE;
+               goto after_vmfault;
+       }
+
+       /*
         * If nx protection of the usermode portion of kernel page
         * tables caused trap, panic.
         */
@@ -842,6 +856,7 @@ trap_pfault(struct trapframe *frame, int usermode)
 #endif
                return (0);
        }
+after_vmfault:
        if (!usermode) {
                if (td->td_intr_nesting_level == 0 &&
                    curpcb->pcb_onfault != NULL) {

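As a sketch of what this trap.c change means for a process: a store
through a write-disabled key faults with PGEX_PK set, vm_fault() is
bypassed with KERN_PROTECTION_FAILURE, and the thread receives the
usual protection-fault signal.  A hypothetical demo, reusing the
deny_writes() helper sketched in the log message above:

	#include <sys/mman.h>
	#include <err.h>
	#include <setjmp.h>
	#include <signal.h>
	#include <stdio.h>

	static sigjmp_buf env;

	static void
	on_fault(int sig __unused)
	{
		siglongjmp(env, 1);
	}

	int
	main(void)
	{
		char *p;

		p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		    MAP_ANON | MAP_PRIVATE, -1, 0);
		if (p == MAP_FAILED)
			err(1, "mmap");
		if (deny_writes(p, 4096, 1) != 0)
			err(1, "sysarch");
		signal(SIGSEGV, on_fault);
		signal(SIGBUS, on_fault);
		if (sigsetjmp(env, 1) == 0) {
			*p = 1;		/* write blocked by PKRU WD bit */
			printf("no fault\n");
		} else
			printf("caught protection-key fault\n");
		return (0);
	}
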
Modified: head/sys/amd64/include/pmap.h
==============================================================================
--- head/sys/amd64/include/pmap.h       Wed Feb 20 09:46:44 2019        (r344352)
+++ head/sys/amd64/include/pmap.h       Wed Feb 20 09:51:13 2019        (r344353)
@@ -66,6 +66,7 @@
 #define        X86_PG_AVAIL2   0x400   /*   <  programmers use         */
 #define        X86_PG_AVAIL3   0x800   /*    \                         */
 #define        X86_PG_PDE_PAT  0x1000  /* PAT  PAT index               */
+#define        X86_PG_PKU(idx) ((pt_entry_t)idx << 59)
 #define        X86_PG_NX       (1ul<<63) /* No-execute */
 #define        X86_PG_AVAIL(x) (1ul << (x))
 
@@ -73,6 +74,10 @@
 #define        X86_PG_PDE_CACHE (X86_PG_PDE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD)
 #define        X86_PG_PTE_CACHE (X86_PG_PTE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD)
 
+/* Protection keys indexes */
+#define        PMAP_MAX_PKRU_IDX       0xf
+#define        X86_PG_PKU_MASK         X86_PG_PKU(PMAP_MAX_PKRU_IDX)
+
 /*
  * Intel extended page table (EPT) bit definitions.
  */
@@ -120,7 +125,7 @@
  * (PTE) page mappings have identical settings for the following fields:
  */
 #define        PG_PTE_PROMOTE  (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_CACHE | \
-           PG_M | PG_A | PG_U | PG_RW | PG_V)
+           PG_M | PG_A | PG_U | PG_RW | PG_V | PG_PKU_MASK)
 
 /*
  * Page Protection Exception bits
@@ -242,6 +247,8 @@
 #include <sys/_cpuset.h>
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
+#include <sys/_pctrie.h>
+#include <sys/_rangeset.h>
 
 #include <vm/_vm_radix.h>
 
@@ -336,6 +343,7 @@ struct pmap {
        long                    pm_eptgen;      /* EPT pmap generation id */
        int                     pm_flags;
        struct pmap_pcids       pm_pcids[MAXCPU];
+       struct rangeset         pm_pkru;
 };
 
 /* flags */
@@ -454,6 +462,10 @@ void       pmap_pti_pcid_invalidate(uint64_t ucr3, uint64_t
 void   pmap_pti_pcid_invlpg(uint64_t ucr3, uint64_t kcr3, vm_offset_t va);
 void   pmap_pti_pcid_invlrng(uint64_t ucr3, uint64_t kcr3, vm_offset_t sva,
            vm_offset_t eva);
+int    pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
+int    pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
+           u_int keyidx, int flags);
+int    pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap);
 #endif /* _KERNEL */
 
 /* Return various clipped indexes for a given VA */

Modified: head/sys/arm/include/pmap.h
==============================================================================
--- head/sys/arm/include/pmap.h Wed Feb 20 09:46:44 2019        (r344352)
+++ head/sys/arm/include/pmap.h Wed Feb 20 09:51:13 2019        (r344353)
@@ -71,5 +71,12 @@ void pmap_kremove_device(vm_offset_t, vm_size_t);
 vm_paddr_t pmap_kextract(vm_offset_t);
 #define vtophys(va)    pmap_kextract((vm_offset_t)(va))
 
+static inline int
+pmap_vmspace_copy(pmap_t dst_pmap __unused, pmap_t src_pmap __unused)
+{
+
+       return (0);
+}
+
 #endif /* _KERNEL */
 #endif /* !_MACHINE_PMAP_H_ */

Modified: head/sys/arm64/include/pmap.h
==============================================================================
--- head/sys/arm64/include/pmap.h       Wed Feb 20 09:46:44 2019        (r344352)
+++ head/sys/arm64/include/pmap.h       Wed Feb 20 09:51:13 2019        (r344353)
@@ -171,6 +171,13 @@ struct pcb *pmap_switch(struct thread *, struct thread
 
 #define        pmap_page_is_mapped(m)  (!TAILQ_EMPTY(&(m)->md.pv_list))
 
+static inline int
+pmap_vmspace_copy(pmap_t dst_pmap __unused, pmap_t src_pmap __unused)
+{
+
+       return (0);
+}
+
 #endif /* _KERNEL */
 
 #endif /* !LOCORE */

Modified: head/sys/i386/include/pmap.h
==============================================================================
--- head/sys/i386/include/pmap.h        Wed Feb 20 09:46:44 2019        (r344352)
+++ head/sys/i386/include/pmap.h        Wed Feb 20 09:51:13 2019        (r344353)
@@ -244,6 +244,13 @@ extern vm_offset_t virtual_end;
 #define        pmap_page_is_write_mapped(m)    (((m)->aflags & PGA_WRITEABLE) != 0)
 #define        pmap_unmapbios(va, sz)  pmap_unmapdev((va), (sz))
 
+static inline int
+pmap_vmspace_copy(pmap_t dst_pmap __unused, pmap_t src_pmap __unused)
+{
+
+       return (0);
+}
+
 struct sf_buf;
 
 /*

Modified: head/sys/mips/include/pmap.h
==============================================================================
--- head/sys/mips/include/pmap.h        Wed Feb 20 09:46:44 2019        (r344352)
+++ head/sys/mips/include/pmap.h        Wed Feb 20 09:51:13 2019        (r344353)
@@ -185,6 +185,13 @@ int pmap_emulate_modified(pmap_t pmap, vm_offset_t va)
 void pmap_page_set_memattr(vm_page_t, vm_memattr_t);
 int pmap_change_attr(vm_offset_t, vm_size_t, vm_memattr_t);
 
+static inline int
+pmap_vmspace_copy(pmap_t dst_pmap __unused, pmap_t src_pmap __unused)
+{
+
+       return (0);
+}
+
 #endif                         /* _KERNEL */
 
 #endif                         /* !LOCORE */

Modified: head/sys/powerpc/include/pmap.h
==============================================================================
--- head/sys/powerpc/include/pmap.h     Wed Feb 20 09:46:44 2019        (r344352)
+++ head/sys/powerpc/include/pmap.h     Wed Feb 20 09:51:13 2019        (r344353)
@@ -288,6 +288,13 @@ vm_offset_t pmap_early_io_map(vm_paddr_t pa, vm_size_t
 void pmap_early_io_unmap(vm_offset_t va, vm_size_t size);
 void pmap_track_page(pmap_t pmap, vm_offset_t va);
 
+static inline int
+pmap_vmspace_copy(pmap_t dst_pmap __unused, pmap_t src_pmap __unused)
+{
+
+       return (0);
+}
+
 #endif
 
 #endif /* !_MACHINE_PMAP_H_ */

Modified: head/sys/riscv/include/pmap.h
==============================================================================
--- head/sys/riscv/include/pmap.h       Wed Feb 20 09:46:44 2019        (r344352)
+++ head/sys/riscv/include/pmap.h       Wed Feb 20 09:51:13 2019        (r344353)
@@ -166,6 +166,13 @@ bool       pmap_get_tables(pmap_t, vm_offset_t, pd_entry_t *
 
 int pmap_fault_fixup(pmap_t, vm_offset_t, vm_prot_t);
 
+static inline int
+pmap_vmspace_copy(pmap_t dst_pmap __unused, pmap_t src_pmap __unused)
+{
+
+       return (0);
+}
+
 #endif /* _KERNEL */
 
 #endif /* !LOCORE */

Modified: head/sys/sparc64/include/pmap.h
==============================================================================
--- head/sys/sparc64/include/pmap.h     Wed Feb 20 09:46:44 2019        (r344352)
+++ head/sys/sparc64/include/pmap.h     Wed Feb 20 09:51:13 2019        (r344353)
@@ -128,4 +128,11 @@ SYSCTL_DECL(_debug_pmap_stats);
 
 #endif
 
+static inline int
+pmap_vmspace_copy(pmap_t dst_pmap __unused, pmap_t src_pmap __unused)
+{
+
+       return (0);
+}
+
 #endif /* !_MACHINE_PMAP_H_ */

Modified: head/sys/vm/vm_fault.c
==============================================================================
--- head/sys/vm/vm_fault.c      Wed Feb 20 09:46:44 2019        (r344352)
+++ head/sys/vm/vm_fault.c      Wed Feb 20 09:51:13 2019        (r344353)
@@ -481,8 +481,20 @@ vm_fault_populate(struct faultstate *fs, vm_prot_t pro
                            fault_flags, true);
                }
                VM_OBJECT_WUNLOCK(fs->first_object);
-               pmap_enter(fs->map->pmap, vaddr, m, prot, fault_type | (wired ?
-                   PMAP_ENTER_WIRED : 0), psind);
+               rv = pmap_enter(fs->map->pmap, vaddr, m, prot, fault_type |
+                   (wired ? PMAP_ENTER_WIRED : 0), psind);
+#if defined(__amd64__)
+               if (psind > 0 && rv == KERN_FAILURE) {
+                       for (i = 0; i < npages; i++) {
+                               rv = pmap_enter(fs->map->pmap, vaddr + ptoa(i),
+                                   &m[i], prot, fault_type |
+                                   (wired ? PMAP_ENTER_WIRED : 0), 0);
+                               MPASS(rv == KERN_SUCCESS);
+                       }
+               }
+#else
+               MPASS(rv == KERN_SUCCESS);
+#endif
                VM_OBJECT_WLOCK(fs->first_object);
                m_mtx = NULL;
                for (i = 0; i < npages; i++) {

Modified: head/sys/vm/vm_map.c
==============================================================================
--- head/sys/vm/vm_map.c        Wed Feb 20 09:46:44 2019        (r344352)
+++ head/sys/vm/vm_map.c        Wed Feb 20 09:51:13 2019        (r344353)
@@ -3544,7 +3544,7 @@ vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_c
        vm_map_t new_map, old_map;
        vm_map_entry_t new_entry, old_entry;
        vm_object_t object;
-       int locked;
+       int error, locked;
        vm_inherit_t inh;
 
        old_map = &vm1->vm_map;
@@ -3553,6 +3553,7 @@ vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_c
            pmap_pinit);
        if (vm2 == NULL)
                return (NULL);
+
        vm2->vm_taddr = vm1->vm_taddr;
        vm2->vm_daddr = vm1->vm_daddr;
        vm2->vm_maxsaddr = vm1->vm_maxsaddr;
@@ -3563,7 +3564,17 @@ vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_c
        locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
        KASSERT(locked, ("vmspace_fork: lock failed"));
 
+       error = pmap_vmspace_copy(new_map->pmap, old_map->pmap);
+       if (error != 0) {
+               sx_xunlock(&old_map->lock);
+               sx_xunlock(&new_map->lock);
+               vm_map_process_deferred();
+               vmspace_free(vm2);
+               return (NULL);
+       }
+
        new_map->anon_loc = old_map->anon_loc;
+
        old_entry = old_map->header.next;
 
        while (old_entry != &old_map->header) {

Modified: head/sys/x86/include/sysarch.h
==============================================================================
--- head/sys/x86/include/sysarch.h      Wed Feb 20 09:46:44 2019        (r344352)
+++ head/sys/x86/include/sysarch.h      Wed Feb 20 09:51:13 2019        (r344353)
@@ -52,6 +52,8 @@
 #define        I386_GET_GSBASE         9
 #define        I386_SET_GSBASE         10
 #define        I386_GET_XFPUSTATE      11
+#define        I386_SET_PKRU           12
+#define        I386_CLEAR_PKRU         13
 
 /* Leave space for 0-127 to avoid translating syscalls */
 #define        AMD64_GET_FSBASE        128
@@ -59,7 +61,13 @@
 #define        AMD64_GET_GSBASE        130
 #define        AMD64_SET_GSBASE        131
 #define        AMD64_GET_XFPUSTATE     132
+#define        AMD64_SET_PKRU          133
+#define        AMD64_CLEAR_PKRU        134
 
+/* Flags for AMD64_SET_PKRU */
+#define        AMD64_PKRU_EXCL         0x0001
+#define        AMD64_PKRU_PERSIST      0x0002
+
 struct i386_ioperm_args {
        unsigned int start;
        unsigned int length;
@@ -94,11 +102,25 @@ struct i386_get_xfpustate {
        int len;
 };
 
+struct i386_set_pkru {
+       unsigned int addr;
+       unsigned int len;
+       unsigned int keyidx;
+       int flags;
+};
+
 struct amd64_get_xfpustate {
        void *addr;
        int len;
 };
 #endif
+
+struct amd64_set_pkru {
+       void *addr;
+       unsigned long len;
+       unsigned int keyidx;
+       int flags;
+};
 
 #ifndef _KERNEL
 union descriptor;
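
As a standalone sanity sketch of the PTE encoding added in
amd64/include/pmap.h above: X86_PG_PKU(idx) places the 4-bit key
index in leaf PTE bits 62:59, just below X86_PG_NX at bit 63, so
X86_PG_PKU_MASK works out to 0x7800000000000000 (the macros are
copied from the patch; the sample PTE value is arbitrary):

	#include <assert.h>
	#include <stdint.h>

	typedef uint64_t pt_entry_t;

	#define	X86_PG_PKU(idx)		((pt_entry_t)idx << 59)
	#define	PMAP_MAX_PKRU_IDX	0xf
	#define	X86_PG_PKU_MASK		X86_PG_PKU(PMAP_MAX_PKRU_IDX)

	int
	main(void)
	{
		pt_entry_t pte;

		/* The key occupies bits 62:59; bit 63 (NX) is untouched. */
		assert(X86_PG_PKU_MASK == 0x7800000000000000UL);
		assert((X86_PG_PKU_MASK & (1UL << 63)) == 0);

		/* Insert key 5 into an NX|M|A|U|RW|V-style PTE, read back. */
		pte = 0x8000000000000067UL;
		pte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(5);
		assert(((pte >> 59) & 0xf) == 5);
		return (0);
	}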