On Thu 25-07-13 18:25:35, Johannes Weiner wrote:
> Unlike global OOM handling, memory cgroup code will invoke the OOM
> killer in any OOM situation because it has no way of telling faults
> occurring in kernel context - which could be handled more gracefully -
> from user-triggered faults.
> 
> Pass a flag that identifies faults originating in user space from the
> architecture-specific fault handlers to generic code so that memcg OOM
> handling can be improved.
> 
> Signed-off-by: Johannes Weiner <[email protected]>

Looks good to me, but I guess the maintainers of the affected archs should
be CCed.
Reviewed-by: Michal Hocko <[email protected]>

> ---
>  arch/alpha/mm/fault.c      |  7 ++++---
>  arch/arc/mm/fault.c        |  6 ++++--
>  arch/arm/mm/fault.c        |  9 ++++++---
>  arch/arm64/mm/fault.c      |  9 ++++++---
>  arch/avr32/mm/fault.c      |  2 ++
>  arch/cris/mm/fault.c       |  6 ++++--
>  arch/frv/mm/fault.c        | 10 ++++++----
>  arch/hexagon/mm/vm_fault.c |  6 ++++--
>  arch/ia64/mm/fault.c       |  6 ++++--
>  arch/m32r/mm/fault.c       | 10 ++++++----
>  arch/m68k/mm/fault.c       |  2 ++
>  arch/metag/mm/fault.c      |  6 ++++--
>  arch/microblaze/mm/fault.c |  7 +++++--
>  arch/mips/mm/fault.c       |  6 ++++--
>  arch/mn10300/mm/fault.c    |  2 ++
>  arch/openrisc/mm/fault.c   |  1 +
>  arch/parisc/mm/fault.c     |  7 +++++--
>  arch/powerpc/mm/fault.c    |  7 ++++---
>  arch/s390/mm/fault.c       |  2 ++
>  arch/score/mm/fault.c      |  7 ++++++-
>  arch/sh/mm/fault.c         |  9 ++++++---
>  arch/sparc/mm/fault_32.c   | 12 +++++++++---
>  arch/sparc/mm/fault_64.c   |  8 +++++---
>  arch/tile/mm/fault.c       |  7 +++++--
>  arch/um/kernel/trap.c      | 20 ++++++++++++--------
>  arch/unicore32/mm/fault.c  |  8 ++++++--
>  arch/x86/mm/fault.c        |  8 +++++---
>  arch/xtensa/mm/fault.c     |  2 ++
>  include/linux/mm.h         |  1 +
>  29 files changed, 132 insertions(+), 61 deletions(-)
> 
> diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c
> index 0c4132d..98838a0 100644
> --- a/arch/alpha/mm/fault.c
> +++ b/arch/alpha/mm/fault.c
> @@ -89,8 +89,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
>       const struct exception_table_entry *fixup;
>       int fault, si_code = SEGV_MAPERR;
>       siginfo_t info;
> -     unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> -                           (cause > 0 ? FAULT_FLAG_WRITE : 0));
> +     unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>  
>       /* As of EV6, a load into $31/$f31 is a prefetch, and never faults
>          (or is suppressed by the PALcode).  Support that for older CPUs
> @@ -115,7 +114,8 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
>       if (address >= TASK_SIZE)
>               goto vmalloc_fault;
>  #endif
> -
> +     if (user_mode(regs))
> +             flags |= FAULT_FLAG_USER;
>  retry:
>       down_read(&mm->mmap_sem);
>       vma = find_vma(mm, address);
> @@ -142,6 +142,7 @@ retry:
>       } else {
>               if (!(vma->vm_flags & VM_WRITE))
>                       goto bad_area;
> +             flags |= FAULT_FLAG_WRITE;
>       }
>  
>       /* If for any reason at all we couldn't handle the fault,
> diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
> index 6b0bb41..d63f3de 100644
> --- a/arch/arc/mm/fault.c
> +++ b/arch/arc/mm/fault.c
> @@ -60,8 +60,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long 
> address)
>       siginfo_t info;
>       int fault, ret;
>       int write = regs->ecr_cause & ECR_C_PROTV_STORE;  /* ST/EX */
> -     unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> -                             (write ? FAULT_FLAG_WRITE : 0);
> +     unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>  
>       /*
>        * We fault-in kernel-space virtual memory on-demand. The
> @@ -89,6 +88,8 @@ void do_page_fault(struct pt_regs *regs, unsigned long 
> address)
>       if (in_atomic() || !mm)
>               goto no_context;
>  
> +     if (user_mode(regs))
> +             flags |= FAULT_FLAG_USER;
>  retry:
>       down_read(&mm->mmap_sem);
>       vma = find_vma(mm, address);
> @@ -117,6 +118,7 @@ good_area:
>       if (write) {
>               if (!(vma->vm_flags & VM_WRITE))
>                       goto bad_area;
> +             flags |= FAULT_FLAG_WRITE;
>       } else {
>               if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
>                       goto bad_area;
> diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
> index 217bcbf..eb8830a 100644
> --- a/arch/arm/mm/fault.c
> +++ b/arch/arm/mm/fault.c
> @@ -261,9 +261,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, 
> struct pt_regs *regs)
>       struct task_struct *tsk;
>       struct mm_struct *mm;
>       int fault, sig, code;
> -     int write = fsr & FSR_WRITE;
> -     unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> -                             (write ? FAULT_FLAG_WRITE : 0);
> +     unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>  
>       if (notify_page_fault(regs, fsr))
>               return 0;
> @@ -282,6 +280,11 @@ do_page_fault(unsigned long addr, unsigned int fsr, 
> struct pt_regs *regs)
>       if (in_atomic() || !mm)
>               goto no_context;
>  
> +     if (user_mode(regs))
> +             flags |= FAULT_FLAG_USER;
> +     if (fsr & FSR_WRITE)
> +             flags |= FAULT_FLAG_WRITE;
> +
>       /*
>        * As per x86, we may deadlock here.  However, since the kernel only
>        * validly references user space from well defined areas of the code,
> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
> index dab1cfd..12205b4 100644
> --- a/arch/arm64/mm/fault.c
> +++ b/arch/arm64/mm/fault.c
> @@ -208,9 +208,7 @@ static int __kprobes do_page_fault(unsigned long addr, 
> unsigned int esr,
>       struct task_struct *tsk;
>       struct mm_struct *mm;
>       int fault, sig, code;
> -     bool write = (esr & ESR_WRITE) && !(esr & ESR_CM);
> -     unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> -             (write ? FAULT_FLAG_WRITE : 0);
> +     unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>  
>       tsk = current;
>       mm  = tsk->mm;
> @@ -226,6 +224,11 @@ static int __kprobes do_page_fault(unsigned long addr, 
> unsigned int esr,
>       if (in_atomic() || !mm)
>               goto no_context;
>  
> +     if (user_mode(regs))
> +             flags |= FAULT_FLAG_USER;
> +     if ((esr & ESR_WRITE) && !(esr & ESR_CM))
> +             flags |= FAULT_FLAG_WRITE;
> +
>       /*
>        * As per x86, we may deadlock here. However, since the kernel only
>        * validly references user space from well defined areas of the code,
> diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c
> index 2ca27b0..0eca933 100644
> --- a/arch/avr32/mm/fault.c
> +++ b/arch/avr32/mm/fault.c
> @@ -86,6 +86,8 @@ asmlinkage void do_page_fault(unsigned long ecr, struct 
> pt_regs *regs)
>  
>       local_irq_enable();
>  
> +     if (user_mode(regs))
> +             flags |= FAULT_FLAG_USER;
>  retry:
>       down_read(&mm->mmap_sem);
>  
> diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c
> index 73312ab..1790f22 100644
> --- a/arch/cris/mm/fault.c
> +++ b/arch/cris/mm/fault.c
> @@ -58,8 +58,7 @@ do_page_fault(unsigned long address, struct pt_regs *regs,
>       struct vm_area_struct * vma;
>       siginfo_t info;
>       int fault;
> -     unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> -                             ((writeaccess & 1) ? FAULT_FLAG_WRITE : 0);
> +     unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>  
>       D(printk(KERN_DEBUG
>                "Page fault for %lX on %X at %lX, prot %d write %d\n",
> @@ -117,6 +116,8 @@ do_page_fault(unsigned long address, struct pt_regs *regs,
>       if (in_atomic() || !mm)
>               goto no_context;
>  
> +     if (user_mode(regs))
> +             flags |= FAULT_FLAG_USER;
>  retry:
>       down_read(&mm->mmap_sem);
>       vma = find_vma(mm, address);
> @@ -155,6 +156,7 @@ retry:
>       } else if (writeaccess == 1) {
>               if (!(vma->vm_flags & VM_WRITE))
>                       goto bad_area;
> +             flags |= FAULT_FLAG_WRITE;
>       } else {
>               if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
>                       goto bad_area;
> diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c
> index 331c1e2..9a66372 100644
> --- a/arch/frv/mm/fault.c
> +++ b/arch/frv/mm/fault.c
> @@ -34,11 +34,11 @@ asmlinkage void do_page_fault(int datammu, unsigned long 
> esr0, unsigned long ear
>       struct vm_area_struct *vma;
>       struct mm_struct *mm;
>       unsigned long _pme, lrai, lrad, fixup;
> +     unsigned long flags = 0;
>       siginfo_t info;
>       pgd_t *pge;
>       pud_t *pue;
>       pte_t *pte;
> -     int write;
>       int fault;
>  
>  #if 0
> @@ -81,6 +81,9 @@ asmlinkage void do_page_fault(int datammu, unsigned long 
> esr0, unsigned long ear
>       if (in_atomic() || !mm)
>               goto no_context;
>  
> +     if (user_mode(__frame))
> +             flags |= FAULT_FLAG_USER;
> +
>       down_read(&mm->mmap_sem);
>  
>       vma = find_vma(mm, ear0);
> @@ -129,7 +132,6 @@ asmlinkage void do_page_fault(int datammu, unsigned long 
> esr0, unsigned long ear
>   */
>   good_area:
>       info.si_code = SEGV_ACCERR;
> -     write = 0;
>       switch (esr0 & ESR0_ATXC) {
>       default:
>               /* handle write to write protected page */
> @@ -140,7 +142,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long 
> esr0, unsigned long ear
>  #endif
>               if (!(vma->vm_flags & VM_WRITE))
>                       goto bad_area;
> -             write = 1;
> +             flags |= FAULT_FLAG_WRITE;
>               break;
>  
>                /* handle read from protected page */
> @@ -162,7 +164,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long 
> esr0, unsigned long ear
>        * make sure we exit gracefully rather than endlessly redo
>        * the fault.
>        */
> -     fault = handle_mm_fault(mm, vma, ear0, write ? FAULT_FLAG_WRITE : 0);
> +     fault = handle_mm_fault(mm, vma, ear0, flags);
>       if (unlikely(fault & VM_FAULT_ERROR)) {
>               if (fault & VM_FAULT_OOM)
>                       goto out_of_memory;
> diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c
> index 1bd276d..8704c93 100644
> --- a/arch/hexagon/mm/vm_fault.c
> +++ b/arch/hexagon/mm/vm_fault.c
> @@ -53,8 +53,7 @@ void do_page_fault(unsigned long address, long cause, 
> struct pt_regs *regs)
>       int si_code = SEGV_MAPERR;
>       int fault;
>       const struct exception_table_entry *fixup;
> -     unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> -                              (cause > 0 ? FAULT_FLAG_WRITE : 0);
> +     unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>  
>       /*
>        * If we're in an interrupt or have no user context,
> @@ -65,6 +64,8 @@ void do_page_fault(unsigned long address, long cause, 
> struct pt_regs *regs)
>  
>       local_irq_enable();
>  
> +     if (user_mode(regs))
> +             flags |= FAULT_FLAG_USER;
>  retry:
>       down_read(&mm->mmap_sem);
>       vma = find_vma(mm, address);
> @@ -96,6 +97,7 @@ good_area:
>       case FLT_STORE:
>               if (!(vma->vm_flags & VM_WRITE))
>                       goto bad_area;
> +             flags |= FAULT_FLAG_WRITE;
>               break;
>       }
>  
> diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
> index 6cf0341..7225dad 100644
> --- a/arch/ia64/mm/fault.c
> +++ b/arch/ia64/mm/fault.c
> @@ -90,8 +90,6 @@ ia64_do_page_fault (unsigned long address, unsigned long 
> isr, struct pt_regs *re
>       mask = ((((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT)
>               | (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT));
>  
> -     flags |= ((mask & VM_WRITE) ? FAULT_FLAG_WRITE : 0);
> -
>       /* mmap_sem is performance critical.... */
>       prefetchw(&mm->mmap_sem);
>  
> @@ -119,6 +117,10 @@ ia64_do_page_fault (unsigned long address, unsigned long 
> isr, struct pt_regs *re
>       if (notify_page_fault(regs, TRAP_BRKPT))
>               return;
>  
> +     if (user_mode(regs))
> +             flags |= FAULT_FLAG_USER;
> +     if (mask & VM_WRITE)
> +             flags |= FAULT_FLAG_WRITE;
>  retry:
>       down_read(&mm->mmap_sem);
>  
> diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c
> index 3cdfa9c..e9c6a80 100644
> --- a/arch/m32r/mm/fault.c
> +++ b/arch/m32r/mm/fault.c
> @@ -78,7 +78,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, 
> unsigned long error_code,
>       struct mm_struct *mm;
>       struct vm_area_struct * vma;
>       unsigned long page, addr;
> -     int write;
> +     unsigned long flags = 0;
>       int fault;
>       siginfo_t info;
>  
> @@ -117,6 +117,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, 
> unsigned long error_code,
>       if (in_atomic() || !mm)
>               goto bad_area_nosemaphore;
>  
> +     if (error_code & ACE_USERMODE)
> +             flags |= FAULT_FLAG_USER;
> +
>       /* When running in the kernel we expect faults to occur only to
>        * addresses in user space.  All other faults represent errors in the
>        * kernel and should generate an OOPS.  Unfortunately, in the case of an
> @@ -166,14 +169,13 @@ asmlinkage void do_page_fault(struct pt_regs *regs, 
> unsigned long error_code,
>   */
>  good_area:
>       info.si_code = SEGV_ACCERR;
> -     write = 0;
>       switch (error_code & (ACE_WRITE|ACE_PROTECTION)) {
>               default:        /* 3: write, present */
>                       /* fall through */
>               case ACE_WRITE: /* write, not present */
>                       if (!(vma->vm_flags & VM_WRITE))
>                               goto bad_area;
> -                     write++;
> +                     flags |= FAULT_FLAG_WRITE;
>                       break;
>               case ACE_PROTECTION:    /* read, present */
>               case 0:         /* read, not present */
> @@ -194,7 +196,7 @@ good_area:
>        */
>       addr = (address & PAGE_MASK);
>       set_thread_fault_code(error_code);
> -     fault = handle_mm_fault(mm, vma, addr, write ? FAULT_FLAG_WRITE : 0);
> +     fault = handle_mm_fault(mm, vma, addr, flags);
>       if (unlikely(fault & VM_FAULT_ERROR)) {
>               if (fault & VM_FAULT_OOM)
>                       goto out_of_memory;
> diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
> index a563727..eb1d61f 100644
> --- a/arch/m68k/mm/fault.c
> +++ b/arch/m68k/mm/fault.c
> @@ -88,6 +88,8 @@ int do_page_fault(struct pt_regs *regs, unsigned long 
> address,
>       if (in_atomic() || !mm)
>               goto no_context;
>  
> +     if (user_mode(regs))
> +             flags |= FAULT_FLAG_USER;
>  retry:
>       down_read(&mm->mmap_sem);
>  
> diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c
> index 8fddf46..332680e 100644
> --- a/arch/metag/mm/fault.c
> +++ b/arch/metag/mm/fault.c
> @@ -53,8 +53,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long 
> address,
>       struct vm_area_struct *vma, *prev_vma;
>       siginfo_t info;
>       int fault;
> -     unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> -                             (write_access ? FAULT_FLAG_WRITE : 0);
> +     unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>  
>       tsk = current;
>  
> @@ -109,6 +108,8 @@ int do_page_fault(struct pt_regs *regs, unsigned long 
> address,
>       if (in_atomic() || !mm)
>               goto no_context;
>  
> +     if (user_mode(regs))
> +             flags |= FAULT_FLAG_USER;
>  retry:
>       down_read(&mm->mmap_sem);
>  
> @@ -121,6 +122,7 @@ good_area:
>       if (write_access) {
>               if (!(vma->vm_flags & VM_WRITE))
>                       goto bad_area;
> +             flags |= FAULT_FLAG_WRITE;
>       } else {
>               if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
>                       goto bad_area;
> diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c
> index 731f739..fa4cf52 100644
> --- a/arch/microblaze/mm/fault.c
> +++ b/arch/microblaze/mm/fault.c
> @@ -92,8 +92,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long 
> address,
>       int code = SEGV_MAPERR;
>       int is_write = error_code & ESR_S;
>       int fault;
> -     unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> -                                      (is_write ? FAULT_FLAG_WRITE : 0);
> +     unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>  
>       regs->ear = address;
>       regs->esr = error_code;
> @@ -121,6 +120,9 @@ void do_page_fault(struct pt_regs *regs, unsigned long 
> address,
>               die("Weird page fault", regs, SIGSEGV);
>       }
>  
> +     if (user_mode(regs))
> +             flags |= FAULT_FLAG_USER;
> +
>       /* When running in the kernel we expect faults to occur only to
>        * addresses in user space.  All other faults represent errors in the
>        * kernel and should generate an OOPS.  Unfortunately, in the case of an
> @@ -199,6 +201,7 @@ good_area:
>       if (unlikely(is_write)) {
>               if (unlikely(!(vma->vm_flags & VM_WRITE)))
>                       goto bad_area;
> +             flags |= FAULT_FLAG_WRITE;
>       /* a read */
>       } else {
>               /* protection fault */
> diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
> index 94d3a31..becc42b 100644
> --- a/arch/mips/mm/fault.c
> +++ b/arch/mips/mm/fault.c
> @@ -42,8 +42,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, 
> unsigned long write,
>       const int field = sizeof(unsigned long) * 2;
>       siginfo_t info;
>       int fault;
> -     unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> -                                              (write ? FAULT_FLAG_WRITE : 0);
> +     unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>  
>  #if 0
>       printk("Cpu%d[%s:%d:%0*lx:%ld:%0*lx]\n", raw_smp_processor_id(),
> @@ -93,6 +92,8 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, 
> unsigned long write,
>       if (in_atomic() || !mm)
>               goto bad_area_nosemaphore;
>  
> +     if (user_mode(regs))
> +             flags |= FAULT_FLAG_USER;
>  retry:
>       down_read(&mm->mmap_sem);
>       vma = find_vma(mm, address);
> @@ -114,6 +115,7 @@ good_area:
>       if (write) {
>               if (!(vma->vm_flags & VM_WRITE))
>                       goto bad_area;
> +             flags |= FAULT_FLAG_WRITE;
>       } else {
>               if (cpu_has_rixi) {
>                       if (address == regs->cp0_epc && !(vma->vm_flags & 
> VM_EXEC)) {
> diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c
> index 8a2e6de..3516cbd 100644
> --- a/arch/mn10300/mm/fault.c
> +++ b/arch/mn10300/mm/fault.c
> @@ -171,6 +171,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs, 
> unsigned long fault_code,
>       if (in_atomic() || !mm)
>               goto no_context;
>  
> +     if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR)
> +             flags |= FAULT_FLAG_USER;
>  retry:
>       down_read(&mm->mmap_sem);
>  
> diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c
> index 4a41f84..0703acf 100644
> --- a/arch/openrisc/mm/fault.c
> +++ b/arch/openrisc/mm/fault.c
> @@ -86,6 +86,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, 
> unsigned long address,
>       if (user_mode(regs)) {
>               /* Exception was in userspace: reenable interrupts */
>               local_irq_enable();
> +             flags |= FAULT_FLAG_USER;
>       } else {
>               /* If exception was in a syscall, then IRQ's may have
>                * been enabled or disabled.  If they were enabled,
> diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c
> index f247a34..d10d27a 100644
> --- a/arch/parisc/mm/fault.c
> +++ b/arch/parisc/mm/fault.c
> @@ -180,6 +180,10 @@ void do_page_fault(struct pt_regs *regs, unsigned long 
> code,
>       if (in_atomic() || !mm)
>               goto no_context;
>  
> +     if (user_mode(regs))
> +             flags |= FAULT_FLAG_USER;
> +     if (acc_type & VM_WRITE)
> +             flags |= FAULT_FLAG_WRITE;
>  retry:
>       down_read(&mm->mmap_sem);
>       vma = find_vma_prev(mm, address, &prev_vma);
> @@ -203,8 +207,7 @@ good_area:
>        * fault.
>        */
>  
> -     fault = handle_mm_fault(mm, vma, address,
> -                     flags | ((acc_type & VM_WRITE) ? FAULT_FLAG_WRITE : 0));
> +     fault = handle_mm_fault(mm, vma, address, flags);
>  
>       if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
>               return;
> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
> index 8726779..d9196c9 100644
> --- a/arch/powerpc/mm/fault.c
> +++ b/arch/powerpc/mm/fault.c
> @@ -223,9 +223,6 @@ int __kprobes do_page_fault(struct pt_regs *regs, 
> unsigned long address,
>       is_write = error_code & ESR_DST;
>  #endif /* CONFIG_4xx || CONFIG_BOOKE */
>  
> -     if (is_write)
> -             flags |= FAULT_FLAG_WRITE;
> -
>  #ifdef CONFIG_PPC_ICSWX
>       /*
>        * we need to do this early because this "data storage
> @@ -280,6 +277,9 @@ int __kprobes do_page_fault(struct pt_regs *regs, 
> unsigned long address,
>  
>       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
>  
> +     if (user_mode(regs))
> +             flags |= FAULT_FLAG_USER;
> +
>       /* When running in the kernel we expect faults to occur only to
>        * addresses in user space.  All other faults represent errors in the
>        * kernel and should generate an OOPS.  Unfortunately, in the case of an
> @@ -408,6 +408,7 @@ good_area:
>       } else if (is_write) {
>               if (!(vma->vm_flags & VM_WRITE))
>                       goto bad_area;
> +             flags |= FAULT_FLAG_WRITE;
>       /* a read */
>       } else {
>               /* protection fault */
> diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
> index f00aefb..6fa7b05 100644
> --- a/arch/s390/mm/fault.c
> +++ b/arch/s390/mm/fault.c
> @@ -302,6 +302,8 @@ static inline int do_exception(struct pt_regs *regs, int 
> access)
>       address = trans_exc_code & __FAIL_ADDR_MASK;
>       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
>       flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
> +     if (regs->psw.mask & PSW_MASK_PSTATE)
> +             flags |= FAULT_FLAG_USER;
>       if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
>               flags |= FAULT_FLAG_WRITE;
>       down_read(&mm->mmap_sem);
> diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c
> index 4b71a62..52238983 100644
> --- a/arch/score/mm/fault.c
> +++ b/arch/score/mm/fault.c
> @@ -47,6 +47,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, 
> unsigned long write,
>       struct task_struct *tsk = current;
>       struct mm_struct *mm = tsk->mm;
>       const int field = sizeof(unsigned long) * 2;
> +     unsigned long flags = 0;
>       siginfo_t info;
>       int fault;
>  
> @@ -75,6 +76,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, 
> unsigned long write,
>       if (in_atomic() || !mm)
>               goto bad_area_nosemaphore;
>  
> +     if (user_mode(regs))
> +             flags |= FAULT_FLAG_USER;
> +
>       down_read(&mm->mmap_sem);
>       vma = find_vma(mm, address);
>       if (!vma)
> @@ -95,6 +99,7 @@ good_area:
>       if (write) {
>               if (!(vma->vm_flags & VM_WRITE))
>                       goto bad_area;
> +             flags |= FAULT_FLAG_WRITE;
>       } else {
>               if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)))
>                       goto bad_area;
> @@ -105,7 +110,7 @@ good_area:
>       * make sure we exit gracefully rather than endlessly redo
>       * the fault.
>       */
> -     fault = handle_mm_fault(mm, vma, address, write);
> +     fault = handle_mm_fault(mm, vma, address, flags);
>       if (unlikely(fault & VM_FAULT_ERROR)) {
>               if (fault & VM_FAULT_OOM)
>                       goto out_of_memory;
> diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
> index 1f49c28..541dc61 100644
> --- a/arch/sh/mm/fault.c
> +++ b/arch/sh/mm/fault.c
> @@ -400,9 +400,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs 
> *regs,
>       struct mm_struct *mm;
>       struct vm_area_struct * vma;
>       int fault;
> -     int write = error_code & FAULT_CODE_WRITE;
> -     unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> -                           (write ? FAULT_FLAG_WRITE : 0));
> +     unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>  
>       tsk = current;
>       mm = tsk->mm;
> @@ -476,6 +474,11 @@ good_area:
>  
>       set_thread_fault_code(error_code);
>  
> +     if (user_mode(regs))
> +             flags |= FAULT_FLAG_USER;
> +     if (error_code & FAULT_CODE_WRITE)
> +             flags |= FAULT_FLAG_WRITE;
> +
>       /*
>        * If for any reason at all we couldn't handle the fault,
>        * make sure we exit gracefully rather than endlessly redo
> diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c
> index e98bfda..59dbd46 100644
> --- a/arch/sparc/mm/fault_32.c
> +++ b/arch/sparc/mm/fault_32.c
> @@ -177,8 +177,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int 
> text_fault, int write,
>       unsigned long g2;
>       int from_user = !(regs->psr & PSR_PS);
>       int fault, code;
> -     unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> -                           (write ? FAULT_FLAG_WRITE : 0));
> +     unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>  
>       if (text_fault)
>               address = regs->pc;
> @@ -235,6 +234,11 @@ good_area:
>                       goto bad_area;
>       }
>  
> +     if (from_user)
> +             flags |= FAULT_FLAG_USER;
> +     if (write)
> +             flags |= FAULT_FLAG_WRITE;
> +
>       /*
>        * If for any reason at all we couldn't handle the fault,
>        * make sure we exit gracefully rather than endlessly redo
> @@ -383,6 +387,7 @@ static void force_user_fault(unsigned long address, int 
> write)
>       struct vm_area_struct *vma;
>       struct task_struct *tsk = current;
>       struct mm_struct *mm = tsk->mm;
> +     unsigned int flags = FAULT_FLAG_USER;
>       int code;
>  
>       code = SEGV_MAPERR;
> @@ -402,11 +407,12 @@ good_area:
>       if (write) {
>               if (!(vma->vm_flags & VM_WRITE))
>                       goto bad_area;
> +             flags |= FAULT_FLAG_WRITE;
>       } else {
>               if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
>                       goto bad_area;
>       }
> -     switch (handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 
> 0)) {
> +     switch (handle_mm_fault(mm, vma, address, flags)) {
>       case VM_FAULT_SIGBUS:
>       case VM_FAULT_OOM:
>               goto do_sigbus;
> diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
> index 5062ff3..c08b9bb 100644
> --- a/arch/sparc/mm/fault_64.c
> +++ b/arch/sparc/mm/fault_64.c
> @@ -314,8 +314,9 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs 
> *regs)
>               } else {
>                       bad_kernel_pc(regs, address);
>                       return;
> -             }
> -     }
> +             }               
> +     } else
> +             flags |= FAULT_FLAG_USER;
>  
>       /*
>        * If we're in an interrupt or have no user
> @@ -418,13 +419,14 @@ good_area:
>                   vma->vm_file != NULL)
>                       set_thread_fault_code(fault_code |
>                                             FAULT_CODE_BLKCOMMIT);
> +
> +             flags |= FAULT_FLAG_WRITE;
>       } else {
>               /* Allow reads even for write-only mappings */
>               if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
>                       goto bad_area;
>       }
>  
> -     flags |= ((fault_code & FAULT_CODE_WRITE) ? FAULT_FLAG_WRITE : 0);
>       fault = handle_mm_fault(mm, vma, address, flags);
>  
>       if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
> diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
> index ac553ee..3ff289f 100644
> --- a/arch/tile/mm/fault.c
> +++ b/arch/tile/mm/fault.c
> @@ -280,8 +280,7 @@ static int handle_page_fault(struct pt_regs *regs,
>       if (!is_page_fault)
>               write = 1;
>  
> -     flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> -              (write ? FAULT_FLAG_WRITE : 0));
> +     flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>  
>       is_kernel_mode = (EX1_PL(regs->ex1) != USER_PL);
>  
> @@ -365,6 +364,9 @@ static int handle_page_fault(struct pt_regs *regs,
>               goto bad_area_nosemaphore;
>       }
>  
> +     if (!is_kernel_mode)
> +             flags |= FAULT_FLAG_USER;
> +
>       /*
>        * When running in the kernel we expect faults to occur only to
>        * addresses in user space.  All other faults represent errors in the
> @@ -425,6 +427,7 @@ good_area:
>  #endif
>               if (!(vma->vm_flags & VM_WRITE))
>                       goto bad_area;
> +             flags |= FAULT_FLAG_WRITE;
>       } else {
>               if (!is_page_fault || !(vma->vm_flags & VM_READ))
>                       goto bad_area;
> diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
> index b2f5adf..5c3aef7 100644
> --- a/arch/um/kernel/trap.c
> +++ b/arch/um/kernel/trap.c
> @@ -30,8 +30,7 @@ int handle_page_fault(unsigned long address, unsigned long 
> ip,
>       pmd_t *pmd;
>       pte_t *pte;
>       int err = -EFAULT;
> -     unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> -                              (is_write ? FAULT_FLAG_WRITE : 0);
> +     unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>  
>       *code_out = SEGV_MAPERR;
>  
> @@ -42,6 +41,8 @@ int handle_page_fault(unsigned long address, unsigned long 
> ip,
>       if (in_atomic())
>               goto out_nosemaphore;
>  
> +     if (is_user)
> +             flags |= FAULT_FLAG_USER;
>  retry:
>       down_read(&mm->mmap_sem);
>       vma = find_vma(mm, address);
> @@ -58,12 +59,15 @@ retry:
>  
>  good_area:
>       *code_out = SEGV_ACCERR;
> -     if (is_write && !(vma->vm_flags & VM_WRITE))
> -             goto out;
> -
> -     /* Don't require VM_READ|VM_EXEC for write faults! */
> -     if (!is_write && !(vma->vm_flags & (VM_READ | VM_EXEC)))
> -             goto out;
> +     if (is_write) {
> +             if (!(vma->vm_flags & VM_WRITE))
> +                     goto out;
> +             flags |= FAULT_FLAG_WRITE;
> +     } else {
> +             /* Don't require VM_READ|VM_EXEC for write faults! */
> +             if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
> +                     goto out;
> +     }
>  
>       do {
>               int fault;
> diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c
> index 8ed3c45..0dc922d 100644
> --- a/arch/unicore32/mm/fault.c
> +++ b/arch/unicore32/mm/fault.c
> @@ -209,8 +209,7 @@ static int do_pf(unsigned long addr, unsigned int fsr, 
> struct pt_regs *regs)
>       struct task_struct *tsk;
>       struct mm_struct *mm;
>       int fault, sig, code;
> -     unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> -                              ((!(fsr ^ 0x12)) ? FAULT_FLAG_WRITE : 0);
> +     unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>  
>       tsk = current;
>       mm = tsk->mm;
> @@ -222,6 +221,11 @@ static int do_pf(unsigned long addr, unsigned int fsr, 
> struct pt_regs *regs)
>       if (in_atomic() || !mm)
>               goto no_context;
>  
> +     if (user_mode(regs))
> +             flags |= FAULT_FLAG_USER;
> +     if (!(fsr ^ 0x12))
> +             flags |= FAULT_FLAG_WRITE;
> +
>       /*
>        * As per x86, we may deadlock here.  However, since the kernel only
>        * validly references user space from well defined areas of the code,
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index 654be4a..6d77c38 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -1011,9 +1011,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long 
> error_code)
>       unsigned long address;
>       struct mm_struct *mm;
>       int fault;
> -     int write = error_code & PF_WRITE;
> -     unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> -                                     (write ? FAULT_FLAG_WRITE : 0);
> +     unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>  
>       tsk = current;
>       mm = tsk->mm;
> @@ -1083,6 +1081,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long 
> error_code)
>       if (user_mode_vm(regs)) {
>               local_irq_enable();
>               error_code |= PF_USER;
> +             flags |= FAULT_FLAG_USER;
>       } else {
>               if (regs->flags & X86_EFLAGS_IF)
>                       local_irq_enable();
> @@ -1109,6 +1108,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long 
> error_code)
>               return;
>       }
>  
> +     if (error_code & PF_WRITE)
> +             flags |= FAULT_FLAG_WRITE;
> +
>       /*
>        * When running in the kernel we expect faults to occur only to
>        * addresses in user space.  All other faults represent errors in
> diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c
> index 4b7bc8d..70fa7bc 100644
> --- a/arch/xtensa/mm/fault.c
> +++ b/arch/xtensa/mm/fault.c
> @@ -72,6 +72,8 @@ void do_page_fault(struct pt_regs *regs)
>              address, exccause, regs->pc, is_write? "w":"", is_exec? "x":"");
>  #endif
>  
> +     if (user_mode(regs))
> +             flags |= FAULT_FLAG_USER;
>  retry:
>       down_read(&mm->mmap_sem);
>       vma = find_vma(mm, address);
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index d5c82dc..c51fc32 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -170,6 +170,7 @@ extern pgprot_t protection_map[16];
>  #define FAULT_FLAG_RETRY_NOWAIT      0x10    /* Don't drop mmap_sem and wait 
> when retrying */
>  #define FAULT_FLAG_KILLABLE  0x20    /* The fault task is in SIGKILL 
> killable region */
>  #define FAULT_FLAG_TRIED     0x40    /* second try */
> +#define FAULT_FLAG_USER              0x80    /* The fault originated in 
> userspace */
>  
>  /*
>   * vm_fault is filled by the the pagefault handler and passed to the vma's
> -- 
> 1.8.3.2
> 

-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to