From: Dave Hansen <dave.han...@linux.intel.com>

We try to enforce protection keys in software the same way that we
do in hardware.  (See long example below).

But, we only want to do this when accessing our *own* process's
memory.  If GDB set PKRU[6].AD=1 (disable access to PKEY 6), then
tried to PTRACE_POKE a target process which just happened to have
some mprotect_pkey(pkey=6) memory, we do *not* want to deny the
debugger access to that memory.  PKRU is fundamentally a
thread-local structure and we do not want to enforce it on access
to _another_ thread's data.

This gets especially tricky when we have workqueues or other
delayed-work mechanisms that might run in a random process's context.
We can check that we only enforce pkeys when operating on our *own* mm,
but delayed work gets performed while an essentially random user
context is active.  We could end up in a situation where a delayed-work
gup fails when it happens to run in the context of its "own" task, but
succeeds when it runs in the context of some other process.  We want to
avoid that.

To avoid that, we add a GUP flag: FOLL_FOREIGN and a fault flag:
FAULT_FLAG_FOREIGN.  They indicate that we are walking an mm
which is not guaranteed to be the same as current->mm and should
not be subject to protection key enforcement.
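
For illustration, here is a minimal sketch (a hypothetical caller, not
part of this patch) of how code that resolves a fault against another
task's mm would use the new fault flag; it follows the same pattern as
the amd_iommu_v2 conversion below:

        static int fault_in_foreign_page(struct mm_struct *mm,
                                         unsigned long addr, bool write)
        {
                struct vm_area_struct *vma;
                unsigned int flags = FAULT_FLAG_FOREIGN;
                int ret;

                if (write)
                        flags |= FAULT_FLAG_WRITE;

                down_read(&mm->mmap_sem);
                vma = find_vma(mm, addr);
                if (!vma || vma->vm_start > addr) {
                        up_read(&mm->mmap_sem);
                        return -EFAULT;
                }
                /* 'mm' may not be current->mm: do not enforce its pkeys */
                ret = handle_mm_fault(mm, vma, addr, flags);
                up_read(&mm->mmap_sem);

                return (ret & VM_FAULT_ERROR) ? -EFAULT : 0;
        }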

Thanks to Jerome Glisse for pointing out this scenario.

*** Why do we enforce protection keys in software?? ***

Imagine that we disabled access to the memory pointed to by 'buf'.
Then, we implemented sys_read() like this:

        sys_read(fd, buf, len...)
        {
                struct page *page = follow_page(buf);
                void *buf_mapped = kmap(page);
                memcpy(buf_mapped, fd_data, len);
                ...
        }

This writes to 'buf' via a *kernel* mapping, which is not subject
to any protection key.  This alternative implementation does the
same thing:

        sys_read(fd, buf, len...)
        {
                copy_to_user(buf, fd_data, len);
                ...
        }

but would hit a protection key fault because the userspace 'buf'
mapping has a protection key set.

To provide consistency, and to make key-protected memory work
as much like mprotect()ed memory as possible, we try to enforce
the same protections as the hardware would when the *kernel* walks
the page tables (and other mm structures).
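
For reference, here is a sketch of the per-key check that the hardware
performs and that the software path mirrors.  The real helper
(__pkru_allows_pkey(), added earlier in this series and called from the
x86 arch_vma_access_permitted() below) is what actually gets used; this
is just the idea, not the implementation:

        /* PKRU holds two bits per key: Access-Disable and Write-Disable */
        static bool pkru_allows_access(u32 pkru, int pkey, bool write)
        {
                bool ad = pkru & (1u << (pkey * 2));     /* access-disable */
                bool wd = pkru & (1u << (pkey * 2 + 1)); /* write-disable  */

                if (ad)
                        return false;
                if (write && wd)
                        return false;
                return true;
        }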

Signed-off-by: Dave Hansen <dave.han...@linux.intel.com>
Cc: linux-a...@vger.kernel.org
---

 b/arch/powerpc/include/asm/mmu_context.h   |    3 ++-
 b/arch/s390/include/asm/mmu_context.h      |    3 ++-
 b/arch/unicore32/include/asm/mmu_context.h |    3 ++-
 b/arch/x86/include/asm/mmu_context.h       |    5 +++--
 b/drivers/iommu/amd_iommu_v2.c             |    8 +++++---
 b/include/asm-generic/mm_hooks.h           |    3 ++-
 b/include/linux/mm.h                       |    2 ++
 b/mm/gup.c                                 |   15 ++++++++++-----
 b/mm/ksm.c                                 |   10 ++++++++--
 b/mm/memory.c                              |    3 ++-
 10 files changed, 38 insertions(+), 17 deletions(-)

diff -puN arch/powerpc/include/asm/mmu_context.h~pkeys-14-gup-fault-foreign-flag arch/powerpc/include/asm/mmu_context.h
--- a/arch/powerpc/include/asm/mmu_context.h~pkeys-14-gup-fault-foreign-flag	2015-12-14 10:42:46.751979986 -0800
+++ b/arch/powerpc/include/asm/mmu_context.h	2015-12-14 10:42:46.769980793 -0800
@@ -148,7 +148,8 @@ static inline void arch_bprm_mm_init(str
 {
 }
 
-static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write)
+static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
+               bool write, bool foreign)
 {
        /* by default, allow everything */
        return true;
diff -puN arch/s390/include/asm/mmu_context.h~pkeys-14-gup-fault-foreign-flag arch/s390/include/asm/mmu_context.h
--- a/arch/s390/include/asm/mmu_context.h~pkeys-14-gup-fault-foreign-flag	2015-12-14 10:42:46.753980076 -0800
+++ b/arch/s390/include/asm/mmu_context.h	2015-12-14 10:42:46.769980793 -0800
@@ -130,7 +130,8 @@ static inline void arch_bprm_mm_init(str
 {
 }
 
-static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write)
+static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
+               bool write, bool foreign)
 {
        /* by default, allow everything */
        return true;
diff -puN arch/unicore32/include/asm/mmu_context.h~pkeys-14-gup-fault-foreign-flag arch/unicore32/include/asm/mmu_context.h
--- a/arch/unicore32/include/asm/mmu_context.h~pkeys-14-gup-fault-foreign-flag	2015-12-14 10:42:46.754980121 -0800
+++ b/arch/unicore32/include/asm/mmu_context.h	2015-12-14 10:42:46.770980838 -0800
@@ -97,7 +97,8 @@ static inline void arch_bprm_mm_init(str
 {
 }
 
-static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write)
+static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
+               bool write, bool foreign)
 {
        /* by default, allow everything */
        return true;
diff -puN arch/x86/include/asm/mmu_context.h~pkeys-14-gup-fault-foreign-flag arch/x86/include/asm/mmu_context.h
--- a/arch/x86/include/asm/mmu_context.h~pkeys-14-gup-fault-foreign-flag	2015-12-14 10:42:46.756980210 -0800
+++ b/arch/x86/include/asm/mmu_context.h	2015-12-14 10:42:46.770980838 -0800
@@ -290,10 +290,11 @@ static inline bool vma_is_foreign(struct
        return false;
 }
 
-static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write)
+static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
+               bool write, bool foreign)
 {
        /* allow access if the VMA is not one from this process */
-       if (vma_is_foreign(vma))
+       if (foreign || vma_is_foreign(vma))
                return true;
        return __pkru_allows_pkey(vma_pkey(vma), write);
 }
diff -puN drivers/iommu/amd_iommu_v2.c~pkeys-14-gup-fault-foreign-flag drivers/iommu/amd_iommu_v2.c
--- a/drivers/iommu/amd_iommu_v2.c~pkeys-14-gup-fault-foreign-flag	2015-12-14 10:42:46.757980255 -0800
+++ b/drivers/iommu/amd_iommu_v2.c      2015-12-14 10:42:46.771980883 -0800
@@ -500,9 +500,11 @@ static void do_fault(struct work_struct
        struct mm_struct *mm;
        struct vm_area_struct *vma;
        u64 address;
-       int ret, write;
+       int ret, flags;
 
-       write = !!(fault->flags & PPR_FAULT_WRITE);
+       flags = FAULT_FLAG_FOREIGN;
+       if (fault->flags & PPR_FAULT_WRITE)
+               flags |= FAULT_FLAG_WRITE;
 
        mm = fault->state->mm;
        address = fault->address;
@@ -523,7 +525,7 @@ static void do_fault(struct work_struct
                goto out;
        }
 
-       ret = handle_mm_fault(mm, vma, address, write);
+       ret = handle_mm_fault(mm, vma, address, flags);
        if (ret & VM_FAULT_ERROR) {
                /* failed to service fault */
                up_read(&mm->mmap_sem);
diff -puN include/asm-generic/mm_hooks.h~pkeys-14-gup-fault-foreign-flag include/asm-generic/mm_hooks.h
--- a/include/asm-generic/mm_hooks.h~pkeys-14-gup-fault-foreign-flag	2015-12-14 10:42:46.759980345 -0800
+++ b/include/asm-generic/mm_hooks.h    2015-12-14 10:42:46.771980883 -0800
@@ -26,7 +26,8 @@ static inline void arch_bprm_mm_init(str
 {
 }
 
-static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write)
+static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
+               bool write, bool foreign)
 {
        /* by default, allow everything */
        return true;
diff -puN include/linux/mm.h~pkeys-14-gup-fault-foreign-flag include/linux/mm.h
--- a/include/linux/mm.h~pkeys-14-gup-fault-foreign-flag	2015-12-14 10:42:46.761980434 -0800
+++ b/include/linux/mm.h        2015-12-14 10:42:46.772980927 -0800
@@ -237,6 +237,7 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_KILLABLE    0x10    /* The fault task is in SIGKILL killable region */
 #define FAULT_FLAG_TRIED       0x20    /* Second try */
 #define FAULT_FLAG_USER                0x40    /* The fault originated in userspace */
+#define FAULT_FLAG_FOREIGN     0x80    /* faulting for non current tsk/mm */
 
 /*
  * vm_fault is filled by the the pagefault handler and passed to the vma's
@@ -2143,6 +2144,7 @@ static inline struct page *follow_page(s
 #define FOLL_MIGRATION 0x400   /* wait for page to replace migration entry */
 #define FOLL_TRIED     0x800   /* a retry, previous pass started an IO */
 #define FOLL_MLOCK     0x1000  /* lock present pages */
+#define FOLL_FOREIGN   0x2000  /* we are working on non-current tsk/mm */
 
 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
                        void *data);
diff -puN mm/gup.c~pkeys-14-gup-fault-foreign-flag mm/gup.c
--- a/mm/gup.c~pkeys-14-gup-fault-foreign-flag  2015-12-14 10:42:46.762980479 -0800
+++ b/mm/gup.c  2015-12-14 10:42:46.772980927 -0800
@@ -310,6 +310,8 @@ static int faultin_page(struct task_stru
                return -ENOENT;
        if (*flags & FOLL_WRITE)
                fault_flags |= FAULT_FLAG_WRITE;
+       if (*flags & FOLL_FOREIGN)
+               fault_flags |= FAULT_FLAG_FOREIGN;
        if (nonblocking)
                fault_flags |= FAULT_FLAG_ALLOW_RETRY;
        if (*flags & FOLL_NOWAIT)
@@ -360,11 +362,13 @@ static int faultin_page(struct task_stru
 static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
 {
        vm_flags_t vm_flags = vma->vm_flags;
+       int write = (gup_flags & FOLL_WRITE);
+       int foreign = (gup_flags & FOLL_FOREIGN);
 
        if (vm_flags & (VM_IO | VM_PFNMAP))
                return -EFAULT;
 
-       if (gup_flags & FOLL_WRITE) {
+       if (write) {
                if (!(vm_flags & VM_WRITE)) {
                        if (!(gup_flags & FOLL_FORCE))
                                return -EFAULT;
@@ -392,7 +396,7 @@ static int check_vma_flags(struct vm_are
                if (!(vm_flags & VM_MAYREAD))
                        return -EFAULT;
        }
-       if (!arch_vma_access_permitted(vma, (gup_flags & FOLL_WRITE)))
+       if (!arch_vma_access_permitted(vma, write, foreign))
                return -EFAULT;
        return 0;
 }
@@ -562,7 +566,8 @@ EXPORT_SYMBOL(__get_user_pages);
 
 bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags)
 {
-       bool write = !!(fault_flags & FAULT_FLAG_WRITE);
+       bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
+       bool foreign = !!(fault_flags & FAULT_FLAG_FOREIGN);
        vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;
 
        if (!(vm_flags & vma->vm_flags))
@@ -570,9 +575,9 @@ bool vma_permits_fault(struct vm_area_st
 
        /*
         * The architecture might have a hardware protection
-        * mechanism other than read/write that can deny access
+        * mechanism other than read/write that can deny access.
         */
-       if (!arch_vma_access_permitted(vma, write))
+       if (!arch_vma_access_permitted(vma, write, foreign))
                return false;
 
        return true;
diff -puN mm/ksm.c~pkeys-14-gup-fault-foreign-flag mm/ksm.c
--- a/mm/ksm.c~pkeys-14-gup-fault-foreign-flag  2015-12-14 10:42:46.764980569 -0800
+++ b/mm/ksm.c  2015-12-14 10:42:46.773980972 -0800
@@ -359,6 +359,10 @@ static inline bool ksm_test_exit(struct
  * in case the application has unmapped and remapped mm,addr meanwhile.
  * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
  * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
+ *
+ * FAULT_FLAG/FOLL_FOREIGN are because we do this outside the context
+ * of the process that owns 'vma'.  We also do not want to enforce
+ * protection keys here anyway.
  */
 static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
 {
@@ -367,12 +371,14 @@ static int break_ksm(struct vm_area_stru
 
        do {
                cond_resched();
-               page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION);
+               page = follow_page(vma, addr,
+                               FOLL_GET | FOLL_MIGRATION | FOLL_FOREIGN);
                if (IS_ERR_OR_NULL(page))
                        break;
                if (PageKsm(page))
                        ret = handle_mm_fault(vma->vm_mm, vma, addr,
-                                                       FAULT_FLAG_WRITE);
+                                                       FAULT_FLAG_WRITE |
+                                                       FAULT_FLAG_FOREIGN);
                else
                        ret = VM_FAULT_WRITE;
                put_page(page);
diff -puN mm/memory.c~pkeys-14-gup-fault-foreign-flag mm/memory.c
--- a/mm/memory.c~pkeys-14-gup-fault-foreign-flag       2015-12-14 10:42:46.766980658 -0800
+++ b/mm/memory.c       2015-12-14 10:42:46.774981017 -0800
@@ -3345,7 +3345,8 @@ static int __handle_mm_fault(struct mm_s
        pmd_t *pmd;
        pte_t *pte;
 
-       if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE))
+       if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
+                                           flags & FAULT_FLAG_FOREIGN))
                return VM_FAULT_SIGSEGV;
 
        if (unlikely(is_vm_hugetlb_page(vma)))
_