Module Name:    src
Committed By:   maxv
Date:           Sun Feb 25 12:37:16 UTC 2018

Modified Files:
        src/sys/arch/amd64/amd64: amd64_trap.S locore.S trap.c

Log Message:
Fix handling of segment register faults when running with SVS. The behavior
is also changed in the non-SVS case.

I've added documentation in amd64_trap.S. Basically, the problem with SVS
is that if iret faults, we already have a full trapframe pushed on the
stack and the CPU will push another frame on this stack (nested), but it
hits the redzone below the stack since it is still running with the user
page table loaded.

To fix that, we pop a good part of the trapframe earlier in intrfastexit.
If iret faults, the current %rsp has enough room for an iret frame, and
the CPU can push that without problem. We then switch back to the outer
iret frame (the frame the CPU was trying to pop by executing iret, but that
it didn't pop for real because iret faulted), call INTRENTRY, and handle
the trap as if it had been received from userland directly.


To generate a diff of this commit:
cvs rdiff -u -r1.36 -r1.37 src/sys/arch/amd64/amd64/amd64_trap.S
cvs rdiff -u -r1.156 -r1.157 src/sys/arch/amd64/amd64/locore.S
cvs rdiff -u -r1.111 -r1.112 src/sys/arch/amd64/amd64/trap.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/arch/amd64/amd64/amd64_trap.S
diff -u src/sys/arch/amd64/amd64/amd64_trap.S:1.36 src/sys/arch/amd64/amd64/amd64_trap.S:1.37
--- src/sys/arch/amd64/amd64/amd64_trap.S:1.36	Sun Feb 25 11:57:44 2018
+++ src/sys/arch/amd64/amd64/amd64_trap.S	Sun Feb 25 12:37:16 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: amd64_trap.S,v 1.36 2018/02/25 11:57:44 maxv Exp $	*/
+/*	$NetBSD: amd64_trap.S,v 1.37 2018/02/25 12:37:16 maxv Exp $	*/
 
 /*
  * Copyright (c) 1998, 2007, 2008, 2017 The NetBSD Foundation, Inc.
@@ -368,69 +368,139 @@ IDTVEC_END(intrspurious)
 #ifndef kernuser_reenter
 /*
  * We need to worry about traps in kernel mode while the kernel %gs isn't
- * loaded. These are either faults on iretq during return to user or loads to
- * %gs.
+ * loaded. When such traps happen, we have CPL=0 and %gs=userland, and we
+ * must perform an additional swapgs to get %gs=kernel.
+ */
+
+#define TF_SMALL(val, reg)		(val - TF_REGSIZE)(reg)
+#define TF_SMALL_REGPUSHED(val, reg)	(val - (TF_REGSIZE - 8))(reg)
+
+/*
+ * It is possible that we received a trap in kernel mode, but with the user
+ * context loaded. There are six cases where this can happen:
  *
- * When such traps happen, we have CPL=0 and %gs=userland, and we must perform
- * an additional swapgs to get %gs=kernel.
+ *  o Execution of SYSRETQ.
+ *  o Execution of IRETQ.
+ *  o Reload of ES.
+ *  o Reload of DS.
+ *  o Reload of FS.
+ *  o Reload of GS.
+ *
+ * When this happens, the kernel is re-entered in kernel mode, but the
+ * previous context is in kernel mode too.
+ *
+ * We have two iret frames in the stack. In the first one, the 'rsp' field
+ * points to the outer iret frame:
+ *
+ * +---------------------------------------------------+
+ * | trapno | err | rip | cs=ring0 | rflags | rsp | ss |
+ * +-------------------------------------------|-------+
+ *                                             |
+ *           +---------------------------------+
+ *           |
+ *           |    +---------------------------------------------------+
+ *           +--> | trapno | err | rip | cs=ring3 | rflags | rsp | ss |
+ *                +---------------------------------------------------+
+ *
+ * We perform a three-step procedure:
+ *
+ *  o We copy the 'trapno' field of the current frame into the 'trapno'
+ *    field of the outer frame.
+ *
+ *  o We update RSP to point to the outer frame. This outer frame is in the
+ *    same stack as the current frame, and likely just after the current
+ *    frame.
+ *
+ *  o We do a normal INTRENTRY. Now that RSP points to the outer frame,
+ *    everything behaves as if we had received a trap from the outer frame,
+ *    that is to say, from userland directly.
+ *
+ * Finally, we jump to 'calltrap' and handle the trap smoothly.
+ *
+ * Two notes regarding SVS:
+ *
+ *  o With SVS, we will receive the trap while the user page tables are
+ *    loaded. That's not a problem, we don't touch anything unmapped here.
+ *
+ *  o With SVS, when the user page tables are loaded, the stack is really
+ *    small, and can contain only one trapframe structure. Therefore, in
+ *    intrfastexit, we must save the GPRs and pop their part of the stack
+ *    right away. If we weren't doing that, and the reload of ES faulted for
+ *    example, then the CPU would try to push an iret frame on the current
+ *    stack (nested), and would double-fault because it touches the redzone
+ *    below the stack (see the documentation in x86/x86/svs.c). By popping
+ *    the GPR part of the stack, we leave enough stack for the CPU to push
+ *    an iret frame, and for us to push two 8-byte registers too.
  */
 	_ALIGN_TEXT
 LABEL(kernuser_reenter)
-	INTRENTRY_L(3f,1:)
-2:
+	testb	$SEL_UPL,TF_SMALL(TF_CS, %rsp)
+	jz	.Lkernelmode
+
+.Lnormal_entry:
+	INTRENTRY
 	sti
 	jmp	calltrap
-3:
-	/*
-	 * Trap in kernel mode.
-	 */
+
+.Lkernelmode:
+	/* We will clobber %rdi */
+	pushq	%rdi
 
 	/* Case 1: fault on sysretq? */
 	leaq	do_sysret(%rip),%rdi
-	cmpq	%rdi,TF_RIP(%rsp)
-	je	1b
+	cmpq	%rdi,TF_SMALL_REGPUSHED(TF_RIP, %rsp)
+	je	.Lkernelmode_but_user
 
 	/* Case 2: fault on iretq? */
 	leaq	do_iret(%rip),%rdi
-	cmpq	%rdi,TF_RIP(%rsp)
+	cmpq	%rdi,TF_SMALL_REGPUSHED(TF_RIP, %rsp)
 	jne	5f
-	movq	TF_RSP(%rsp),%rdi	/* Must read %rsp, may be a pad word */
-	testb	$SEL_UPL,8(%rdi)	/* Check %cs of outer iret frame */
-	je	2b			/* jump if iret was to kernel  */
-	jmp	1b			/* to user - must restore %gs */
+	movq	TF_SMALL_REGPUSHED(TF_RSP, %rsp),%rdi	/* get %rsp */
+	testb	$SEL_UPL,8(%rdi)	/* check %cs of outer iret frame */
+	je	.Lnormal_entry		/* jump if iret was to kernel  */
+	jmp	.Lkernelmode_but_user	/* to user - must restore %gs */
 5:
 
-	/* Case 3: move to %gs? */
+	/* Case 3: move to %es? */
+	leaq	do_mov_es(%rip),%rdi
+	cmpq	%rdi,TF_SMALL_REGPUSHED(TF_RIP, %rsp)
+	je	.Lkernelmode_but_user
+
+	/* Case 4: move to %ds? */
+	leaq	do_mov_ds(%rip),%rdi
+	cmpq	%rdi,TF_SMALL_REGPUSHED(TF_RIP, %rsp)
+	je	.Lkernelmode_but_user
+
+	/* Case 5: move to %fs? */
+	leaq	do_mov_fs(%rip),%rdi
+	cmpq	%rdi,TF_SMALL_REGPUSHED(TF_RIP, %rsp)
+	je	.Lkernelmode_but_user
+
+	/* Case 6: move to %gs? */
 	leaq	do_mov_gs(%rip),%rdi
-	cmpq	%rdi,TF_RIP(%rsp)
-	je	1b
+	cmpq	%rdi,TF_SMALL_REGPUSHED(TF_RIP, %rsp)
+	je	.Lkernelmode_but_user
 
-	/* None of the above cases */
-	jmp	2b	/* normal kernel fault */
+	/* None of the above cases: normal kernel fault */
+	popq	%rdi
+	jmp	.Lnormal_entry
+
+.Lkernelmode_but_user:
+	movq	TF_SMALL_REGPUSHED(TF_RSP, %rsp),%rdi
+
+	pushq	%rax
+	movq	16(%rsp),%rax	/* 16(%rsp) = current TF_TRAPNO */
+	movq	%rax,(%rdi)	/* (%rdi) = outer TF_TRAPNO */
+	popq	%rax
+
+	movq	%rdi,%rsp
+	jmp	.Lnormal_entry
 END(kernuser_reenter)
 #endif
 
 	TEXT_USER_END
 
 /*
- * trap() calls here when it detects a fault in INTRFASTEXIT (loading the
- * segment registers or during the iret itself). The address of the (possibly
- * reconstructed) user trap frame is passed as an argument.
- *
- * Typically the code will have raised a SIGSEGV which will be actioned
- * by the code below.
- */
-	.type	_C_LABEL(trap_return_fault_return), @function
-LABEL(trap_return_fault_return)
-	mov	%rdi,%rsp		/* frame for user return */
-#ifdef DIAGNOSTIC
-	/* We can't recover the saved %rbx, so suppress warning */
-	movl	CPUVAR(ILEVEL),%ebx
-#endif
-	jmp	.Lalltraps_checkusr
-END(trap_return_fault_return)
-
-/*
  * All traps go through here. Call the generic trap handler, and
  * check for ASTs afterwards.
  */

Index: src/sys/arch/amd64/amd64/locore.S
diff -u src/sys/arch/amd64/amd64/locore.S:1.156 src/sys/arch/amd64/amd64/locore.S:1.157
--- src/sys/arch/amd64/amd64/locore.S:1.156	Sat Feb 24 17:12:10 2018
+++ src/sys/arch/amd64/amd64/locore.S	Sun Feb 25 12:37:16 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: locore.S,v 1.156 2018/02/24 17:12:10 maxv Exp $	*/
+/*	$NetBSD: locore.S,v 1.157 2018/02/25 12:37:16 maxv Exp $	*/
 
 /*
  * Copyright-o-rama!
@@ -1544,33 +1544,43 @@ END(pagezero)
 
 	TEXT_USER_BEGIN
 
+/*
+ * In intrfastexit, we advance %rsp at the beginning. We then access the
+ * segment registers in the trapframe with TF_BACKW (backwards). See the
+ * documentation in amd64_trap.S for an explanation.
+ */
+
+#define TF_BACKW(val, reg)	(val - TF_REGSIZE)(reg)
+
 	_ALIGN_TEXT
 LABEL(intrfastexit)
 	NOT_XEN(cli;)
 	SVS_LEAVE
 	INTR_RESTORE_GPRS
-	testb	$SEL_UPL,TF_CS(%rsp)	/* interrupted %cs */
+	addq	$TF_REGSIZE,%rsp	/* iret frame */
+
+	testb	$SEL_UPL,TF_BACKW(TF_CS, %rsp)
 	jz	.Lkexit
-	cmpw	$LSEL(LUCODE_SEL, SEL_UPL),TF_CS(%rsp)
+	cmpw	$LSEL(LUCODE_SEL, SEL_UPL),TF_BACKW(TF_CS, %rsp)
 	je	.Luexit64
-	cmpw	$GSEL(GUCODE_SEL, SEL_UPL),TF_CS(%rsp)
+	cmpw	$GSEL(GUCODE_SEL, SEL_UPL),TF_BACKW(TF_CS, %rsp)
 	je	.Luexit64
 #ifdef XEN
-	cmpw	$FLAT_RING3_CS64,TF_CS(%rsp)
+	cmpw	$FLAT_RING3_CS64,TF_BACKW(TF_CS, %rsp)
 	je	.Luexit64
 #endif
 
 .Luexit32:
+	SWAPGS
 do_mov_es:
-	movw	TF_ES(%rsp),%es
+	movw	TF_BACKW(TF_ES, %rsp),%es
 do_mov_ds:
-	movw	TF_DS(%rsp),%ds
+	movw	TF_BACKW(TF_DS, %rsp),%ds
 do_mov_fs:
-	movw	TF_FS(%rsp),%fs
-	SWAPGS
+	movw	TF_BACKW(TF_FS, %rsp),%fs
 #ifndef XEN
 do_mov_gs:
-	movw	TF_GS(%rsp),%gs
+	movw	TF_BACKW(TF_GS, %rsp),%gs
 #endif
 	jmp	.Lkexit
 
@@ -1578,7 +1588,7 @@ do_mov_gs:
 	SWAPGS
 
 .Lkexit:
-	addq	$TF_REGSIZE+16,%rsp	/* + T_xxx and error code */
+	addq	$16,%rsp	/* 16 = T_xxx + error code */
 do_iret:
 	iretq
 END(intrfastexit)

Index: src/sys/arch/amd64/amd64/trap.c
diff -u src/sys/arch/amd64/amd64/trap.c:1.111 src/sys/arch/amd64/amd64/trap.c:1.112
--- src/sys/arch/amd64/amd64/trap.c:1.111	Sat Jan 20 08:30:53 2018
+++ src/sys/arch/amd64/amd64/trap.c	Sun Feb 25 12:37:16 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: trap.c,v 1.111 2018/01/20 08:30:53 maxv Exp $	*/
+/*	$NetBSD: trap.c,v 1.112 2018/02/25 12:37:16 maxv Exp $	*/
 
 /*
  * Copyright (c) 1998, 2000, 2017 The NetBSD Foundation, Inc.
@@ -64,7 +64,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: trap.c,v 1.111 2018/01/20 08:30:53 maxv Exp $");
+__KERNEL_RCSID(0, "$NetBSD: trap.c,v 1.112 2018/02/25 12:37:16 maxv Exp $");
 
 #include "opt_ddb.h"
 #include "opt_kgdb.h"
@@ -246,93 +246,6 @@ doubletrap(struct trapframe *frame)
 }
 
 /*
- * Did we receive in kernel mode a trap that ought to be considered as a user
- * trap? If this function returns, the answer is no.
- *
- * Such traps can be triggered when the kernel fails to return to userland,
- * because of incorrect segment registers.
- */
-#ifndef XEN
-static void trap_user_kernelmode(struct trapframe *, int, lwp_t *, proc_t *);
-
-static void
-trap_user_kernelmode(struct trapframe *frame, int type, lwp_t *l, proc_t *p)
-{
-	extern uint64_t do_mov_es, do_mov_ds, do_mov_fs, do_mov_gs;
-	extern uint64_t do_iret;
-	struct trapframe *vframe;
-	ksiginfo_t ksi;
-
-	if (frame->tf_rip == 0) {
-		/*
-		 * Assume that if we jumped to null we probably did it via a
-		 * null function pointer, so print the return address.
-		 */
-		printf("kernel jumped to null; return addr was %p\n",
-		    *(void **)frame->tf_rsp);
-		return;
-	}
-
-	KSI_INIT_TRAP(&ksi);
-	ksi.ksi_signo = SIGSEGV;
-	ksi.ksi_code = SEGV_ACCERR;
-	ksi.ksi_trap = type;
-
-	/*
-	 * Get %rsp value before fault - there may be a pad word below the
-	 * trap frame.
-	 */
-	vframe = (void *)frame->tf_rsp;
-
-	if (frame->tf_rip == (uint64_t)&do_iret) {
-		/*
-		 * The 'iretq' instruction faulted, so we have the
-		 * 'user' registers saved after the kernel
-		 * %rip:%cs:%fl:%rsp:%ss of the iret, and below that
-		 * the user %rip:%cs:%fl:%rsp:%ss the 'iret' was
-		 * processing.
-		 * We must copy the user register back over the
-		 * kernel fault frame to generate a normal stack
-		 * frame (eg for sending a SIGSEGV).
-		 */
-		vframe = (void *)((char *)vframe
-		    - offsetof(struct trapframe, tf_rip));
-		memmove(vframe, frame, offsetof(struct trapframe, tf_rip));
-		/* Set the faulting address to the user %rip */
-		ksi.ksi_addr = (void *)vframe->tf_rip;
-	} else if (frame->tf_rip == (uint64_t)&do_mov_es ||
-	    frame->tf_rip == (uint64_t)&do_mov_ds ||
-	    frame->tf_rip == (uint64_t)&do_mov_fs ||
-	    frame->tf_rip == (uint64_t)&do_mov_gs) {
-		/*
-		 * We faulted loading one of the user segment registers.
-		 * The stack frame containing the user registers is
-		 * still valid and pointed to by tf_rsp.
-		 */
-		if (KERNELMODE(vframe->tf_cs))
-			return;
-		/* There is no valid address for the fault */
-	} else {
-		return;
-	}
-
-	/* XXX: worry about on-stack trampolines for nested handlers?? */
-	/* Save outer frame for any signal return */
-	l->l_md.md_regs = vframe;
-	(*p->p_emul->e_trapsignal)(l, &ksi);
-	/* Return to user by reloading the user frame */
-	trap_return_fault_return(vframe);
-	/* NOTREACHED */
-}
-#else
-/*
- * XXX: there has to be an equivalent 'problem' but I (dsl) don't know exactly
- * what happens! For now panic the kernel.
- */
-#define trap_user_kernelmode(frame, type, l, p)	/* NOTHING */
-#endif
-
-/*
  * trap(frame): exception, fault, and trap interface to BSD kernel.
  *
  * This common code is called from assembly language IDT gate entry routines
@@ -433,7 +346,6 @@ trap(struct trapframe *frame)
 			return;
 		}
 
-		trap_user_kernelmode(frame, type, l, p);
 		goto we_re_toast;
 
 	case T_PROTFLT|T_USER:		/* protection fault */

Reply via email to