Module Name: src Committed By: maxv Date: Sun Feb 25 12:37:16 UTC 2018
Modified Files: src/sys/arch/amd64/amd64: amd64_trap.S locore.S trap.c Log Message: Fix handling of segment register faults when running with SVS. The behavior is changed also in the non-SVS case. I've put a documentation in amd64_trap.S. Basically, the problem with SVS is that if iret faults, we already have a full trapframe pushed on the stack and the CPU will push another frame on this stack (nested), but it hits the redzone below the stack since it is still running with the user page table loaded. To fix that, we pop a good part of the trapframe earlier in intrfastexit. If iret faults, the current %rsp has enough room for an iret frame, and the CPU can push that without problem. We then switch back to the outer iret frame (the frame the CPU was trying to pop by executing iret, but that it didn't pop for real because iret faulted), call INTRENTRY, and handle the trap as if it had been received from userland directly. To generate a diff of this commit: cvs rdiff -u -r1.36 -r1.37 src/sys/arch/amd64/amd64/amd64_trap.S cvs rdiff -u -r1.156 -r1.157 src/sys/arch/amd64/amd64/locore.S cvs rdiff -u -r1.111 -r1.112 src/sys/arch/amd64/amd64/trap.c Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/sys/arch/amd64/amd64/amd64_trap.S diff -u src/sys/arch/amd64/amd64/amd64_trap.S:1.36 src/sys/arch/amd64/amd64/amd64_trap.S:1.37 --- src/sys/arch/amd64/amd64/amd64_trap.S:1.36 Sun Feb 25 11:57:44 2018 +++ src/sys/arch/amd64/amd64/amd64_trap.S Sun Feb 25 12:37:16 2018 @@ -1,4 +1,4 @@ -/* $NetBSD: amd64_trap.S,v 1.36 2018/02/25 11:57:44 maxv Exp $ */ +/* $NetBSD: amd64_trap.S,v 1.37 2018/02/25 12:37:16 maxv Exp $ */ /* * Copyright (c) 1998, 2007, 2008, 2017 The NetBSD Foundation, Inc. @@ -368,69 +368,139 @@ IDTVEC_END(intrspurious) #ifndef kernuser_reenter /* * We need to worry about traps in kernel mode while the kernel %gs isn't - * loaded. These are either faults on iretq during return to user or loads to - * %gs. + * loaded. When such traps happen, we have CPL=0 and %gs=userland, and we + * must perform an additional swapgs to get %gs=kernel. + */ + +#define TF_SMALL(val, reg) (val - TF_REGSIZE)(reg) +#define TF_SMALL_REGPUSHED(val, reg) (val - (TF_REGSIZE - 8))(reg) + +/* + * It is possible that we received a trap in kernel mode, but with the user + * context loaded. There are six cases where this can happen: * - * When such traps happen, we have CPL=0 and %gs=userland, and we must perform - * an additional swapgs to get %gs=kernel. + * o Execution of SYSRETQ. + * o Execution of IRETQ. + * o Reload of ES. + * o Reload of DS. + * o Reload of FS. + * o Reload of GS. + * + * When this happens, the kernel is re-entered in kernel mode, but the + * previous context is in kernel mode too. + * + * We have two iret frames in the stack. In the first one, the 'rsp' field + * points to the outer iret frame: + * + * +---------------------------------------------------+ + * | trapno | err | rip | cs=ring0 | rflags | rsp | ss | + * +-------------------------------------------|-------+ + * | + * +---------------------------------+ + * | + * | +---------------------------------------------------+ + * +--> | trapno | err | rip | cs=ring3 | rflags | rsp | ss | + * +---------------------------------------------------+ + * + * We perform a three-step procedure: + * + * o We copy the 'trapno' field of the current frame into the 'trapno' + * field of the outer frame. + * + * o We update RSP to point to the outer frame. This outer frame is in the + * same stack as the current frame, and likely just after the current + * frame. + * + * o We do a normal INTRENTRY. Now that RSP points to the outer frame, + * everything behaves as if we had received a trap from the outer frame, + * that is to say, from userland directly. + * + * Finally, we jump to 'calltrap' and handle the trap smoothly. + * + * Two notes regarding SVS: + * + * o With SVS, we will receive the trap while the user page tables are + * loaded. That's not a problem, we don't touch anything unmapped here. + * + * o With SVS, when the user page tables are loaded, the stack is really + * small, and can contain only one trapframe structure. Therefore, in + * intrfastexit, we must save the GPRs and pop their part of the stack + * right away. If we weren't doing that, and the reload of ES faulted for + * example, then the CPU would try to push an iret frame on the current + * stack (nested), and would double-fault because it touches the redzone + * below the stack (see the documentation in x86/x86/svs.c). By popping + * the GPR part of the stack, we leave enough stack for the CPU to push + * an iret frame, and for us to push two 8-byte registers too. */ _ALIGN_TEXT LABEL(kernuser_reenter) - INTRENTRY_L(3f,1:) -2: + testb $SEL_UPL,TF_SMALL(TF_CS, %rsp) + jz .Lkernelmode + +.Lnormal_entry: + INTRENTRY sti jmp calltrap -3: - /* - * Trap in kernel mode. - */ + +.Lkernelmode: + /* We will clobber %rdi */ + pushq %rdi /* Case 1: fault on sysretq? */ leaq do_sysret(%rip),%rdi - cmpq %rdi,TF_RIP(%rsp) - je 1b + cmpq %rdi,TF_SMALL_REGPUSHED(TF_RIP, %rsp) + je .Lkernelmode_but_user /* Case 2: fault on iretq? */ leaq do_iret(%rip),%rdi - cmpq %rdi,TF_RIP(%rsp) + cmpq %rdi,TF_SMALL_REGPUSHED(TF_RIP, %rsp) jne 5f - movq TF_RSP(%rsp),%rdi /* Must read %rsp, may be a pad word */ - testb $SEL_UPL,8(%rdi) /* Check %cs of outer iret frame */ - je 2b /* jump if iret was to kernel */ - jmp 1b /* to user - must restore %gs */ + movq TF_SMALL_REGPUSHED(TF_RSP, %rsp),%rdi /* get %rsp */ + testb $SEL_UPL,8(%rdi) /* check %cs of outer iret frame */ + je .Lnormal_entry /* jump if iret was to kernel */ + jmp .Lkernelmode_but_user /* to user - must restore %gs */ 5: - /* Case 3: move to %gs? */ + /* Case 3: move to %es? */ + leaq do_mov_es(%rip),%rdi + cmpq %rdi,TF_SMALL_REGPUSHED(TF_RIP, %rsp) + je .Lkernelmode_but_user + + /* Case 4: move to %ds? */ + leaq do_mov_ds(%rip),%rdi + cmpq %rdi,TF_SMALL_REGPUSHED(TF_RIP, %rsp) + je .Lkernelmode_but_user + + /* Case 5: move to %fs? */ + leaq do_mov_fs(%rip),%rdi + cmpq %rdi,TF_SMALL_REGPUSHED(TF_RIP, %rsp) + je .Lkernelmode_but_user + + /* Case 6: move to %gs? */ leaq do_mov_gs(%rip),%rdi - cmpq %rdi,TF_RIP(%rsp) - je 1b + cmpq %rdi,TF_SMALL_REGPUSHED(TF_RIP, %rsp) + je .Lkernelmode_but_user - /* None of the above cases */ - jmp 2b /* normal kernel fault */ + /* None of the above cases: normal kernel fault */ + popq %rdi + jmp .Lnormal_entry + +.Lkernelmode_but_user: + movq TF_SMALL_REGPUSHED(TF_RSP, %rsp),%rdi + + pushq %rax + movq 16(%rsp),%rax /* 16(%rsp) = current TF_TRAPNO */ + movq %rax,(%rdi) /* (%rdi) = outer TF_TRAPNO */ + popq %rax + + movq %rdi,%rsp + jmp .Lnormal_entry END(kernuser_reenter) #endif TEXT_USER_END /* - * trap() calls here when it detects a fault in INTRFASTEXIT (loading the - * segment registers or during the iret itself). The address of the (possibly - * reconstructed) user trap frame is passed as an argument. - * - * Typically the code will have raised a SIGSEGV which will be actioned - * by the code below. - */ - .type _C_LABEL(trap_return_fault_return), @function -LABEL(trap_return_fault_return) - mov %rdi,%rsp /* frame for user return */ -#ifdef DIAGNOSTIC - /* We can't recover the saved %rbx, so suppress warning */ - movl CPUVAR(ILEVEL),%ebx -#endif - jmp .Lalltraps_checkusr -END(trap_return_fault_return) - -/* * All traps go through here. Call the generic trap handler, and * check for ASTs afterwards. */ Index: src/sys/arch/amd64/amd64/locore.S diff -u src/sys/arch/amd64/amd64/locore.S:1.156 src/sys/arch/amd64/amd64/locore.S:1.157 --- src/sys/arch/amd64/amd64/locore.S:1.156 Sat Feb 24 17:12:10 2018 +++ src/sys/arch/amd64/amd64/locore.S Sun Feb 25 12:37:16 2018 @@ -1,4 +1,4 @@ -/* $NetBSD: locore.S,v 1.156 2018/02/24 17:12:10 maxv Exp $ */ +/* $NetBSD: locore.S,v 1.157 2018/02/25 12:37:16 maxv Exp $ */ /* * Copyright-o-rama! @@ -1544,33 +1544,43 @@ END(pagezero) TEXT_USER_BEGIN +/* + * In intrfastexit, we advance %rsp at the beginning. We then access the + * segment registers in the trapframe with TF_BACKW (backwards). See the + * documentation in amd64_trap.S for an explanation. + */ + +#define TF_BACKW(val, reg) (val - TF_REGSIZE)(reg) + _ALIGN_TEXT LABEL(intrfastexit) NOT_XEN(cli;) SVS_LEAVE INTR_RESTORE_GPRS - testb $SEL_UPL,TF_CS(%rsp) /* interrupted %cs */ + addq $TF_REGSIZE,%rsp /* iret frame */ + + testb $SEL_UPL,TF_BACKW(TF_CS, %rsp) jz .Lkexit - cmpw $LSEL(LUCODE_SEL, SEL_UPL),TF_CS(%rsp) + cmpw $LSEL(LUCODE_SEL, SEL_UPL),TF_BACKW(TF_CS, %rsp) je .Luexit64 - cmpw $GSEL(GUCODE_SEL, SEL_UPL),TF_CS(%rsp) + cmpw $GSEL(GUCODE_SEL, SEL_UPL),TF_BACKW(TF_CS, %rsp) je .Luexit64 #ifdef XEN - cmpw $FLAT_RING3_CS64,TF_CS(%rsp) + cmpw $FLAT_RING3_CS64,TF_BACKW(TF_CS, %rsp) je .Luexit64 #endif .Luexit32: + SWAPGS do_mov_es: - movw TF_ES(%rsp),%es + movw TF_BACKW(TF_ES, %rsp),%es do_mov_ds: - movw TF_DS(%rsp),%ds + movw TF_BACKW(TF_DS, %rsp),%ds do_mov_fs: - movw TF_FS(%rsp),%fs - SWAPGS + movw TF_BACKW(TF_FS, %rsp),%fs #ifndef XEN do_mov_gs: - movw TF_GS(%rsp),%gs + movw TF_BACKW(TF_GS, %rsp),%gs #endif jmp .Lkexit @@ -1578,7 +1588,7 @@ do_mov_gs: SWAPGS .Lkexit: - addq $TF_REGSIZE+16,%rsp /* + T_xxx and error code */ + addq $16,%rsp /* 16 = T_xxx + error code */ do_iret: iretq END(intrfastexit) Index: src/sys/arch/amd64/amd64/trap.c diff -u src/sys/arch/amd64/amd64/trap.c:1.111 src/sys/arch/amd64/amd64/trap.c:1.112 --- src/sys/arch/amd64/amd64/trap.c:1.111 Sat Jan 20 08:30:53 2018 +++ src/sys/arch/amd64/amd64/trap.c Sun Feb 25 12:37:16 2018 @@ -1,4 +1,4 @@ -/* $NetBSD: trap.c,v 1.111 2018/01/20 08:30:53 maxv Exp $ */ +/* $NetBSD: trap.c,v 1.112 2018/02/25 12:37:16 maxv Exp $ */ /* * Copyright (c) 1998, 2000, 2017 The NetBSD Foundation, Inc. @@ -64,7 +64,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: trap.c,v 1.111 2018/01/20 08:30:53 maxv Exp $"); +__KERNEL_RCSID(0, "$NetBSD: trap.c,v 1.112 2018/02/25 12:37:16 maxv Exp $"); #include "opt_ddb.h" #include "opt_kgdb.h" @@ -246,93 +246,6 @@ doubletrap(struct trapframe *frame) } /* - * Did we receive in kernel mode a trap that ought to be considered as a user - * trap? If this function returns, the answer is no. - * - * Such traps can be triggered when the kernel fails to return to userland, - * because of incorrect segment registers. - */ -#ifndef XEN -static void trap_user_kernelmode(struct trapframe *, int, lwp_t *, proc_t *); - -static void -trap_user_kernelmode(struct trapframe *frame, int type, lwp_t *l, proc_t *p) -{ - extern uint64_t do_mov_es, do_mov_ds, do_mov_fs, do_mov_gs; - extern uint64_t do_iret; - struct trapframe *vframe; - ksiginfo_t ksi; - - if (frame->tf_rip == 0) { - /* - * Assume that if we jumped to null we probably did it via a - * null function pointer, so print the return address. - */ - printf("kernel jumped to null; return addr was %p\n", - *(void **)frame->tf_rsp); - return; - } - - KSI_INIT_TRAP(&ksi); - ksi.ksi_signo = SIGSEGV; - ksi.ksi_code = SEGV_ACCERR; - ksi.ksi_trap = type; - - /* - * Get %rsp value before fault - there may be a pad word below the - * trap frame. - */ - vframe = (void *)frame->tf_rsp; - - if (frame->tf_rip == (uint64_t)&do_iret) { - /* - * The 'iretq' instruction faulted, so we have the - * 'user' registers saved after the kernel - * %rip:%cs:%fl:%rsp:%ss of the iret, and below that - * the user %rip:%cs:%fl:%rsp:%ss the 'iret' was - * processing. - * We must copy the user register back over the - * kernel fault frame to generate a normal stack - * frame (eg for sending a SIGSEGV). - */ - vframe = (void *)((char *)vframe - - offsetof(struct trapframe, tf_rip)); - memmove(vframe, frame, offsetof(struct trapframe, tf_rip)); - /* Set the faulting address to the user %rip */ - ksi.ksi_addr = (void *)vframe->tf_rip; - } else if (frame->tf_rip == (uint64_t)&do_mov_es || - frame->tf_rip == (uint64_t)&do_mov_ds || - frame->tf_rip == (uint64_t)&do_mov_fs || - frame->tf_rip == (uint64_t)&do_mov_gs) { - /* - * We faulted loading one of the user segment registers. - * The stack frame containing the user registers is - * still valid and pointed to by tf_rsp. - */ - if (KERNELMODE(vframe->tf_cs)) - return; - /* There is no valid address for the fault */ - } else { - return; - } - - /* XXX: worry about on-stack trampolines for nested handlers?? */ - /* Save outer frame for any signal return */ - l->l_md.md_regs = vframe; - (*p->p_emul->e_trapsignal)(l, &ksi); - /* Return to user by reloading the user frame */ - trap_return_fault_return(vframe); - /* NOTREACHED */ -} -#else -/* - * XXX: there has to be an equivalent 'problem' but I (dsl) don't know exactly - * what happens! For now panic the kernel. - */ -#define trap_user_kernelmode(frame, type, l, p) /* NOTHING */ -#endif - -/* * trap(frame): exception, fault, and trap interface to BSD kernel. * * This common code is called from assembly language IDT gate entry routines @@ -433,7 +346,6 @@ trap(struct trapframe *frame) return; } - trap_user_kernelmode(frame, type, l, p); goto we_re_toast; case T_PROTFLT|T_USER: /* protection fault */