We saw a suspected spurious fault that failed the VMA access check but passed the spurious-fault page table check, for a user-space address access (rw, exec). This patch tries to catch that case, which may indicate a stale TLB entry (a software bug was found this way) and a harmful spurious fault.
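In outline, each failing VMA check in access_error() now jumps to a new maybe_spurious label instead of returning 1 directly, and the fault is only reported as an access error if the page table walk (spurious_page_table_check(), split out of spurious_kernel_fault() in the diff below) also refuses the access. A condensed sketch of the new tail of access_error(), showing the read-not-present path as an example:

	/* read, not present: */
	if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
		goto maybe_spurious;

	return 0;

maybe_spurious:
	/*
	 * The VMA forbids the access, but the page table walk
	 * allows it: warn once about the suspected stale TLB
	 * entry and do not treat the fault as an access error,
	 * so no SIGSEGV is raised from this path.
	 */
	if (spurious_page_table_check(error_code, address)) {
		WARN_ONCE(1, "spurious fault? Failed VMA check, passed page table check!\n");
		return 0;
	}

	return 1;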
Signed-off-by: Luming Yu <luming...@intel.com>
Signed-off-by: Yongkai Wu <yongka...@tencent.com>
---
 arch/x86/mm/fault.c | 95 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 54 insertions(+), 41 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 71d4b9d4d43f..5e2a49010d87 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1004,29 +1004,7 @@ static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
 	return 1;
 }
 
-/*
- * Handle a spurious fault caused by a stale TLB entry.
- *
- * This allows us to lazily refresh the TLB when increasing the
- * permissions of a kernel page (RO -> RW or NX -> X). Doing it
- * eagerly is very expensive since that implies doing a full
- * cross-processor TLB flush, even if no stale TLB entries exist
- * on other processors.
- *
- * Spurious faults may only occur if the TLB contains an entry with
- * fewer permission than the page table entry. Non-present (P = 0)
- * and reserved bit (R = 1) faults are never spurious.
- *
- * There are no security implications to leaving a stale TLB when
- * increasing the permissions on a page.
- *
- * Returns non-zero if a spurious fault was handled, zero otherwise.
- *
- * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
- * (Optional Invalidation).
- */
-static noinline int
-spurious_kernel_fault(unsigned long error_code, unsigned long address)
+static int spurious_page_table_check(unsigned long error_code, unsigned long address)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
@@ -1035,19 +1013,6 @@ spurious_kernel_fault(unsigned long error_code, unsigned long address)
 	pte_t *pte;
 	int ret;
 
-	/*
-	 * Only writes to RO or instruction fetches from NX may cause
-	 * spurious faults.
-	 *
-	 * These could be from user or supervisor accesses but the TLB
-	 * is only lazily flushed after a kernel mapping protection
-	 * change, so user accesses are not expected to cause spurious
-	 * faults.
-	 */
-	if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
-	    error_code != (X86_PF_INSTR | X86_PF_PROT))
-		return 0;
-
 	pgd = init_mm.pgd + pgd_index(address);
 	if (!pgd_present(*pgd))
 		return 0;
@@ -1090,12 +1055,52 @@ spurious_kernel_fault(unsigned long error_code, unsigned long address)
 	return ret;
 }
+
+/*
+ * Handle a spurious fault caused by a stale TLB entry.
+ *
+ * This allows us to lazily refresh the TLB when increasing the
+ * permissions of a kernel page (RO -> RW or NX -> X). Doing it
+ * eagerly is very expensive since that implies doing a full
+ * cross-processor TLB flush, even if no stale TLB entries exist
+ * on other processors.
+ *
+ * Spurious faults may only occur if the TLB contains an entry with
+ * fewer permission than the page table entry. Non-present (P = 0)
+ * and reserved bit (R = 1) faults are never spurious.
+ *
+ * There are no security implications to leaving a stale TLB when
+ * increasing the permissions on a page.
+ *
+ * Returns non-zero if a spurious fault was handled, zero otherwise.
+ *
+ * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
+ * (Optional Invalidation).
+ */
+static noinline int
+spurious_kernel_fault(unsigned long error_code, unsigned long address)
+{
+	/*
+	 * Only writes to RO or instruction fetches from NX may cause
+	 * spurious faults.
+	 *
+	 * These could be from user or supervisor accesses but the TLB
+	 * is only lazily flushed after a kernel mapping protection
+	 * change, so user accesses are not expected to cause spurious
+	 * faults.
+	 */
+	if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
+	    error_code != (X86_PF_INSTR | X86_PF_PROT))
+		return 0;
+
+	return spurious_page_table_check(error_code, address);
+}
 NOKPROBE_SYMBOL(spurious_kernel_fault);
 
 int show_unhandled_signals = 1;
 
 static inline int
-access_error(unsigned long error_code, struct vm_area_struct *vma)
+access_error(unsigned long error_code, unsigned long address, struct vm_area_struct *vma)
 {
 	/* This is only called for the current mm, so: */
 	bool foreign = false;
 
@@ -1120,19 +1125,27 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
 	if (error_code & X86_PF_WRITE) {
 		/* write, present and write, not present: */
 		if (unlikely(!(vma->vm_flags & VM_WRITE)))
-			return 1;
+			goto maybe_spurious;
 		return 0;
 	}
 
 	/* read, present: */
 	if (unlikely(error_code & X86_PF_PROT))
-		return 1;
+		goto maybe_spurious;
 
 	/* read, not present: */
 	if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
-		return 1;
+		goto maybe_spurious;
 
 	return 0;
+
+maybe_spurious:
+	if (spurious_page_table_check(error_code, address)) {
+		WARN_ONCE(1, "spurious fault? Failed VMA check, passed page table check!\n");
+		return 0;
+	}
+
+	return 1;
 }
 
 static int fault_in_kernel_space(unsigned long address)
@@ -1402,7 +1415,7 @@ void do_user_addr_fault(struct pt_regs *regs,
 	 * we can handle it..
 	 */
 good_area:
-	if (unlikely(access_error(sw_error_code, vma))) {
+	if (unlikely(access_error(sw_error_code, address, vma))) {
 		bad_area_access_error(regs, sw_error_code, address, vma);
 		return;
 	}
-- 
2.14.4