Module Name:    src
Committed By:   martin
Date:           Thu Mar 22 16:59:04 UTC 2018

Modified Files:
        src/sys/arch/amd64/amd64 [netbsd-8]: amd64_trap.S db_machdep.c
            genassym.cf locore.S machdep.c trap.c vector.S
        src/sys/arch/amd64/conf [netbsd-8]: GENERIC kern.ldscript
        src/sys/arch/amd64/include [netbsd-8]: frameasm.h param.h pmap.h
        src/sys/arch/x86/conf [netbsd-8]: files.x86
        src/sys/arch/x86/include [netbsd-8]: cpu.h pmap.h
        src/sys/arch/x86/x86 [netbsd-8]: cpu.c pmap.c vm_machdep.c
            x86_machdep.c
        src/sys/arch/xen/conf [netbsd-8]: files.compat
Added Files:
        src/sys/arch/x86/x86 [netbsd-8]: svs.c

Log Message:
Pull up the following revisions, requested by maxv in ticket #652:

        sys/arch/amd64/amd64/amd64_trap.S       upto 1.39 (partial, patch)
        sys/arch/amd64/amd64/db_machdep.c       1.6 (patch)
        sys/arch/amd64/amd64/genassym.cf        1.65,1.66,1.67 (patch)
        sys/arch/amd64/amd64/locore.S           upto 1.159 (partial, patch)
        sys/arch/amd64/amd64/machdep.c          1.299-1.302 (patch)
        sys/arch/amd64/amd64/trap.c             upto 1.113 (partial, patch)
        sys/arch/amd64/amd64/vector.S           upto 1.61 (partial, patch)
        sys/arch/amd64/conf/GENERIC             1.477,1.478 (patch)
        sys/arch/amd64/conf/kern.ldscript       1.26 (patch)
        sys/arch/amd64/include/frameasm.h       upto 1.37 (partial, patch)
        sys/arch/amd64/include/param.h          1.25 (patch)
        sys/arch/amd64/include/pmap.h           1.41,1.43,1.44 (patch)
        sys/arch/x86/conf/files.x86             1.91,1.93 (patch)
        sys/arch/x86/include/cpu.h              1.88,1.89 (patch)
        sys/arch/x86/include/pmap.h             1.75 (patch)
        sys/arch/x86/x86/cpu.c                  1.144,1.146,1.148,1.149 (patch)
        sys/arch/x86/x86/pmap.c                 upto 1.289 (partial, patch)
        sys/arch/x86/x86/vm_machdep.c           1.31,1.32 (patch)
        sys/arch/x86/x86/x86_machdep.c          1.104,1.106,1.108 (patch)
        sys/arch/x86/x86/svs.c                  1.1-1.14
        sys/arch/xen/conf/files.compat          1.30 (patch)

Backport SVS. Not enabled yet.
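
The backport keeps SVS behind both a build-time option and a run-time flag,
both visible in the diffs below. A rough, hedged sketch of how it would
eventually be turned on (the option is deliberately left commented out in
GENERIC for now, and the semantics of writing to the sysctl node depend on
the handler in the new svs.c):

        # kernel config: uncomment the new option added to GENERIC
        options         SVS             # Separate Virtual Space

        # once booted, the state is exposed as a new machdep sysctl node
        sysctl machdep.svs.enabled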


To generate a diff of this commit:
cvs rdiff -u -r1.5.6.1 -r1.5.6.2 src/sys/arch/amd64/amd64/amd64_trap.S
cvs rdiff -u -r1.4 -r1.4.30.1 src/sys/arch/amd64/amd64/db_machdep.c
cvs rdiff -u -r1.60.10.1 -r1.60.10.2 src/sys/arch/amd64/amd64/genassym.cf
cvs rdiff -u -r1.123.6.4 -r1.123.6.5 src/sys/arch/amd64/amd64/locore.S
cvs rdiff -u -r1.255.6.5 -r1.255.6.6 src/sys/arch/amd64/amd64/machdep.c
cvs rdiff -u -r1.96.4.1 -r1.96.4.2 src/sys/arch/amd64/amd64/trap.c
cvs rdiff -u -r1.49.2.1 -r1.49.2.2 src/sys/arch/amd64/amd64/vector.S
cvs rdiff -u -r1.459.2.5 -r1.459.2.6 src/sys/arch/amd64/conf/GENERIC
cvs rdiff -u -r1.22.6.2 -r1.22.6.3 src/sys/arch/amd64/conf/kern.ldscript
cvs rdiff -u -r1.20.32.1 -r1.20.32.2 src/sys/arch/amd64/include/frameasm.h
cvs rdiff -u -r1.21.6.1 -r1.21.6.2 src/sys/arch/amd64/include/param.h
cvs rdiff -u -r1.39 -r1.39.8.1 src/sys/arch/amd64/include/pmap.h
cvs rdiff -u -r1.88 -r1.88.6.1 src/sys/arch/x86/conf/files.x86
cvs rdiff -u -r1.71.2.3 -r1.71.2.4 src/sys/arch/x86/include/cpu.h
cvs rdiff -u -r1.64.6.1 -r1.64.6.2 src/sys/arch/x86/include/pmap.h
cvs rdiff -u -r1.130.2.4 -r1.130.2.5 src/sys/arch/x86/x86/cpu.c
cvs rdiff -u -r1.245.6.5 -r1.245.6.6 src/sys/arch/x86/x86/pmap.c
cvs rdiff -u -r0 -r1.14.2.2 src/sys/arch/x86/x86/svs.c
cvs rdiff -u -r1.28.6.2 -r1.28.6.3 src/sys/arch/x86/x86/vm_machdep.c
cvs rdiff -u -r1.91.4.1 -r1.91.4.2 src/sys/arch/x86/x86/x86_machdep.c
cvs rdiff -u -r1.25.8.1 -r1.25.8.2 src/sys/arch/xen/conf/files.compat

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/arch/amd64/amd64/amd64_trap.S
diff -u src/sys/arch/amd64/amd64/amd64_trap.S:1.5.6.1 src/sys/arch/amd64/amd64/amd64_trap.S:1.5.6.2
--- src/sys/arch/amd64/amd64/amd64_trap.S:1.5.6.1	Wed Mar  7 14:50:56 2018
+++ src/sys/arch/amd64/amd64/amd64_trap.S	Thu Mar 22 16:59:03 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: amd64_trap.S,v 1.5.6.1 2018/03/07 14:50:56 martin Exp $	*/
+/*	$NetBSD: amd64_trap.S,v 1.5.6.2 2018/03/22 16:59:03 martin Exp $	*/
 
 /*
  * Copyright (c) 1998, 2007, 2008, 2017 The NetBSD Foundation, Inc.
@@ -95,13 +95,19 @@
 #define	PRE_TRAP
 #endif
 
+#define TRAPENTRY			\
+	INTRENTRY			; \
+	jmp	.Lalltraps_noentry
+
 #define	TRAP_NJ(a)	PRE_TRAP ; pushq $(a)
 #define	ZTRAP_NJ(a)	PRE_TRAP ; pushq $0 ; pushq $(a)
-#define	TRAP(a)		TRAP_NJ(a) ; jmp _C_LABEL(alltraps)
-#define	ZTRAP(a)	ZTRAP_NJ(a) ; jmp _C_LABEL(alltraps)
+#define	TRAP(a)		TRAP_NJ(a) ; TRAPENTRY
+#define	ZTRAP(a)	ZTRAP_NJ(a) ; TRAPENTRY
 
 	.text
 
+	TEXT_USER_BEGIN
+
 IDTVEC(trap00)
 	ZTRAP(T_DIVIDE)
 IDTVEC_END(trap00)
@@ -128,6 +134,7 @@ IDTVEC(trap02)
 	ZTRAP_NJ(T_NMI)
 	subq	$TF_REGSIZE,%rsp
 	INTR_SAVE_GPRS
+	SVS_ENTER_ALTSTACK
 	cld
 	movw	%gs,TF_GS(%rsp)
 	movw	%fs,TF_FS(%rsp)
@@ -143,6 +150,7 @@ IDTVEC(trap02)
 	movq	%rsp,%rdi
 	incq	CPUVAR(NTRAP)
 	call	_C_LABEL(nmitrap)
+	SVS_LEAVE_ALTSTACK
 	swapgs
 	jmp	.Lnmileave
 
@@ -150,6 +158,7 @@ IDTVEC(trap02)
 	movq	%rsp,%rdi
 	incq	CPUVAR(NTRAP)
 	call	_C_LABEL(nmitrap)
+	SVS_LEAVE_ALTSTACK
 
 .Lnmileave:
 	INTR_RESTORE_GPRS
@@ -221,6 +230,7 @@ IDTVEC(trap08)
 	TRAP_NJ(T_DOUBLEFLT)
 	subq	$TF_REGSIZE,%rsp
 	INTR_SAVE_GPRS
+	SVS_ENTER_ALTSTACK
 	testb	$SEL_UPL,TF_CS(%rsp)
 	jz	1f
 	swapgs
@@ -235,6 +245,7 @@ IDTVEC(trap08)
 	incq	CPUVAR(NTRAP)
 	call	_C_LABEL(doubletrap)
 
+	SVS_LEAVE_ALTSTACK
 	INTR_RESTORE_GPRS
 
 	testb	$SEL_UPL,TF_CS(%rsp)
@@ -260,22 +271,22 @@ IDTVEC_END(trap10)
  * equivalent of iret, if it does this code would be needed
  * in order to copy the user segment registers into the fault frame.
  */
-#define check_swapgs alltraps
+#define kernuser_reenter alltraps
 #endif
 
 IDTVEC(trap11)		/* #NP() Segment not present */
 	TRAP_NJ(T_SEGNPFLT)
-	jmp	check_swapgs
+	jmp	kernuser_reenter
 IDTVEC_END(trap11)
 
 IDTVEC(trap12)		/* #SS() Stack exception */
 	TRAP_NJ(T_STKFLT)
-	jmp	check_swapgs
+	jmp	kernuser_reenter
 IDTVEC_END(trap12)
 
 IDTVEC(trap13)		/* #GP() General protection */
 	TRAP_NJ(T_PROTFLT)
-	jmp	check_swapgs
+	jmp	kernuser_reenter
 IDTVEC_END(trap13)
 
 IDTVEC(trap14)
@@ -352,68 +363,135 @@ IDTVEC(intrspurious)
 	jmp	.Lalltraps_checkusr
 IDTVEC_END(intrspurious)
 
-
+#ifndef kernuser_reenter
 /*
- * trap() calls here when it detects a fault in INTRFASTEXIT (loading the
- * segment registers or during the iret itself). The address of the (possibly
- * reconstructed) user trap frame is passed as an argument.
- *
- * Typically the code will have raised a SIGSEGV which will be actioned
- * by the code below.
+ * We need to worry about traps in kernel mode while the kernel %gs isn't
+ * loaded. When such traps happen, we have CPL=0 and %gs=userland, and we
+ * must perform an additional swapgs to get %gs=kernel.
  */
-	.type	_C_LABEL(trap_return_fault_return), @function
-LABEL(trap_return_fault_return)
-	mov	%rdi,%rsp		/* frame for user return */
-#ifdef DIAGNOSTIC
-	/* We can't recover the saved %rbx, so suppress warning */
-	movl	CPUVAR(ILEVEL),%ebx
-#endif
-	jmp	.Lalltraps_checkusr
-END(trap_return_fault_return)
 
-#ifndef check_swapgs
+#define TF_SMALL(val, reg)		(val - TF_REGSIZE)(reg)
+#define TF_SMALL_REGPUSHED(val, reg)	(val - (TF_REGSIZE - 8))(reg)
+
 /*
- * We need to worry about traps in kernel mode while the kernel %gs isn't
- * loaded. These are either faults on iretq during return to user or loads to
- * %gs.
+ * It is possible that we received a trap in kernel mode, but with the user
+ * context loaded. There are three cases where this can happen:
  *
- * When such traps happen, we have CPL=0 and %gs=userland, and we must perform
- * an additional swapgs to get %gs=kernel.
+ *  o Execution of IRETQ.
+ *  o Reload of ES.
+ *  o Reload of DS.
+ *
+ * When this happens, the kernel is re-entered in kernel mode, but the
+ * previous context is in kernel mode too.
+ *
+ * We have two iret frames in the stack. In the first one, we also pushed
+ * 'trapno' and 'err'. The 'rsp' field points to the outer iret frame:
+ *
+ * +---------------------------------------------------+
+ * | trapno | err | rip | cs=ring0 | rflags | rsp | ss |
+ * +-------------------------------------------|-------+
+ *                                             |
+ *           +---------------------------------+
+ *           |
+ *           |    +------------------------------------+
+ *           +--> | rip | cs=ring3 | rflags | rsp | ss |
+ *                +------------------------------------+
+ *
+ * We perform a three-step procedure:
+ *
+ *  o We update RSP to point to the outer frame. This outer frame is in the
+ *    same stack as the current frame, and likely just after the current
+ *    frame.
+ *
+ *  o We push, in this outer frame, the 'err' and 'trapno' fields of the
+ *    CURRENT frame.
+ *
+ *  o We do a normal INTRENTRY. Now that RSP points to the outer frame,
+ *    everything behaves as if we had received a trap from the outer frame,
+ *    that is to say, from userland directly.
+ *
+ * Finally, we jump to 'calltrap' and handle the trap smoothly.
+ *
+ * Two notes regarding SVS:
+ *
+ *  o With SVS, we will receive the trap while the user page tables are
+ *    loaded. That's not a problem, we don't touch anything unmapped here.
+ *
+ *  o With SVS, when the user page tables are loaded, the stack is really
+ *    small, and can contain only one trapframe structure. Therefore, in
+ *    intrfastexit, we must save the GPRs and pop their part of the stack
+ *    right away. If we weren't doing that, and the reload of ES faulted for
+ *    example, then the CPU would try to push an iret frame on the current
+ *    stack (nested), and would double-fault because it touches the redzone
+ *    below the stack (see the documentation in x86/x86/svs.c). By popping
+ *    the GPR part of the stack, we leave enough stack for the CPU to push
+ *    an iret frame, and for us to push one 8-byte register (%rdi) too.
  */
-NENTRY(check_swapgs)
-	INTRENTRY_L(3f,1:)
-2:
+	_ALIGN_TEXT
+LABEL(kernuser_reenter)
+	testb	$SEL_UPL,TF_SMALL(TF_CS, %rsp)
+	jz	.Lkernelmode
+
+.Lnormal_entry:
+	INTRENTRY
 	sti
 	jmp	calltrap
-3:
-	/*
-	 * Trap in kernel mode.
-	 */
+
+.Lkernelmode:
+	/* We will clobber %rdi */
+	pushq	%rdi
+
 	/* Case 1: fault on iretq? */
-	movq	TF_RIP(%rsp),%rax
-	cmpw	$0xcf48,(%rax)		/* Faulting instruction is iretq ? */
-	jne	5f			/* Jump if not */
-	movq	TF_RSP(%rsp),%rax	/* Must read %rsp, may be a pad word */
-	testb	$SEL_UPL,8(%rax)	/* Check %cs of outer iret frame */
-	je	2b			/* jump if iret was to kernel  */
-	jmp	1b			/* to user - must restore %gs */
+	leaq	do_iret(%rip),%rdi
+	cmpq	%rdi,TF_SMALL_REGPUSHED(TF_RIP, %rsp)
+	jne	5f
+	movq	TF_SMALL_REGPUSHED(TF_RSP, %rsp),%rdi	/* get %rsp */
+	testb	$SEL_UPL,8(%rdi)	/* check %cs of outer iret frame */
+	je	.Lnormal_entry		/* jump if iret was to kernel  */
+	jmp	.Lkernelmode_but_user	/* to user - must restore %gs */
 5:
 
-	/* Case 2: move to %gs? */
-	movw	(%rax),%ax
-	andb	$070,%ah		/* mask mod/rm from mod/reg/rm */
-	cmpw	$0x8e+050*256,%ax	/* Any move to %gs (reg 5) */
-	jne	2b			/* No - normal kernel fault */
-	jmp	1b			/* Yes - restore %gs */
-END(check_swapgs)
+	/* Case 2: move to %es? */
+	leaq	do_mov_es(%rip),%rdi
+	cmpq	%rdi,TF_SMALL_REGPUSHED(TF_RIP, %rsp)
+	je	.Lkernelmode_but_user
+
+	/* Case 3: move to %ds? */
+	leaq	do_mov_ds(%rip),%rdi
+	cmpq	%rdi,TF_SMALL_REGPUSHED(TF_RIP, %rsp)
+	je	.Lkernelmode_but_user
+
+	/* None of the above cases: normal kernel fault */
+	popq	%rdi
+	jmp	.Lnormal_entry
+
+.Lkernelmode_but_user:
+	/*
+	 * Here we have %rdi pushed on the stack, hence 8+.
+	 */
+	movq	%rsp,%rdi
+	movq	TF_SMALL_REGPUSHED(TF_RSP, %rsp),%rsp
+
+	/* Push tf_err and tf_trapno */
+	pushq	8+8(%rdi)	/* 8+8(%rdi) = current TF_ERR */
+	pushq	8+0(%rdi)	/* 8+0(%rdi) = current TF_TRAPNO */
+
+	/* Restore %rdi */
+	movq	(%rdi),%rdi
+
+	jmp	.Lnormal_entry
+END(kernuser_reenter)
 #endif
 
+	TEXT_USER_END
+
 /*
  * All traps go through here. Call the generic trap handler, and
  * check for ASTs afterwards.
  */
 NENTRY(alltraps)
 	INTRENTRY
+.Lalltraps_noentry:
 	STI(si)
 
 calltrap:

Index: src/sys/arch/amd64/amd64/db_machdep.c
diff -u src/sys/arch/amd64/amd64/db_machdep.c:1.4 src/sys/arch/amd64/amd64/db_machdep.c:1.4.30.1
--- src/sys/arch/amd64/amd64/db_machdep.c:1.4	Wed Oct  3 17:43:22 2012
+++ src/sys/arch/amd64/amd64/db_machdep.c	Thu Mar 22 16:59:03 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: db_machdep.c,v 1.4 2012/10/03 17:43:22 riastradh Exp $	*/
+/*	$NetBSD: db_machdep.c,v 1.4.30.1 2018/03/22 16:59:03 martin Exp $	*/
 
 /* 
  * Mach Operating System
@@ -26,7 +26,7 @@
  * rights to redistribute these changes.
  */
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: db_machdep.c,v 1.4 2012/10/03 17:43:22 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: db_machdep.c,v 1.4.30.1 2018/03/22 16:59:03 martin Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -213,11 +213,13 @@ db_frame_info(long *frame, db_addr_t cal
 		if (!strcmp(name, "trap")) {
 			*is_trap = TRAP;
 			narg = 0;
-		} else if (!strcmp(name, "syscall")) {
+		} else if (!strcmp(name, "syscall") ||
+		    !strcmp(name, "handle_syscall")) {
 			*is_trap = SYSCALL;
 			narg = 0;
 		} else if (name[0] == 'X') {
 			if (!strncmp(name, "Xintr", 5) ||
+			    !strncmp(name, "Xhandle", 7) ||
 			    !strncmp(name, "Xresume", 7) ||
 			    !strncmp(name, "Xstray", 6) ||
 			    !strncmp(name, "Xhold", 5) ||

Index: src/sys/arch/amd64/amd64/genassym.cf
diff -u src/sys/arch/amd64/amd64/genassym.cf:1.60.10.1 src/sys/arch/amd64/amd64/genassym.cf:1.60.10.2
--- src/sys/arch/amd64/amd64/genassym.cf:1.60.10.1	Tue Mar 13 15:47:44 2018
+++ src/sys/arch/amd64/amd64/genassym.cf	Thu Mar 22 16:59:03 2018
@@ -1,4 +1,4 @@
-#	$NetBSD: genassym.cf,v 1.60.10.1 2018/03/13 15:47:44 martin Exp $
+#	$NetBSD: genassym.cf,v 1.60.10.2 2018/03/22 16:59:03 martin Exp $
 
 #
 # Copyright (c) 1998, 2006, 2007, 2008 The NetBSD Foundation, Inc.
@@ -236,6 +236,13 @@ define	CPU_INFO_CURLDT		offsetof(struct 
 define	CPU_INFO_IDLELWP	offsetof(struct cpu_info, ci_data.cpu_idlelwp)
 define	CPU_INFO_PMAP		offsetof(struct cpu_info, ci_pmap)
 define	CPU_INFO_TSS		offsetof(struct cpu_info, ci_tss)
+ifdef SVS
+define	CPU_INFO_UPDIRPA	offsetof(struct cpu_info, ci_svs_updirpa)
+define	CPU_INFO_KPDIRPA	offsetof(struct cpu_info, ci_svs_kpdirpa)
+define	CPU_INFO_RSP0		offsetof(struct cpu_info, ci_svs_rsp0)
+define	CPU_INFO_URSP0		offsetof(struct cpu_info, ci_svs_ursp0)
+define	CPU_INFO_KRSP0		offsetof(struct cpu_info, ci_svs_krsp0)
+endif
 define	CPU_INFO_NSYSCALL	offsetof(struct cpu_info, ci_data.cpu_nsyscall)
 define	CPU_INFO_NTRAP		offsetof(struct cpu_info, ci_data.cpu_ntrap)
 define	CPU_INFO_NINTR		offsetof(struct cpu_info, ci_data.cpu_nintr)

Index: src/sys/arch/amd64/amd64/locore.S
diff -u src/sys/arch/amd64/amd64/locore.S:1.123.6.4 src/sys/arch/amd64/amd64/locore.S:1.123.6.5
--- src/sys/arch/amd64/amd64/locore.S:1.123.6.4	Tue Mar 13 15:47:44 2018
+++ src/sys/arch/amd64/amd64/locore.S	Thu Mar 22 16:59:03 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: locore.S,v 1.123.6.4 2018/03/13 15:47:44 martin Exp $	*/
+/*	$NetBSD: locore.S,v 1.123.6.5 2018/03/22 16:59:03 martin Exp $	*/
 
 /*
  * Copyright-o-rama!
@@ -160,6 +160,7 @@
 #include "opt_compat_netbsd32.h"
 #include "opt_compat_ibcs2.h"
 #include "opt_xen.h"
+#include "opt_svs.h"
 
 #include "assym.h"
 #include "lapic.h"
@@ -329,6 +330,9 @@
 	.globl	_C_LABEL(bootinfo)
 	.globl	_C_LABEL(biosbasemem)
 	.globl	_C_LABEL(biosextmem)
+	.globl	do_mov_es
+	.globl	do_mov_ds
+	.globl	do_iret
 
 	.type	_C_LABEL(tablesize), @object
 _C_LABEL(tablesize):	.long	TABLESIZE
@@ -1080,6 +1084,16 @@ ENTRY(cpu_switchto)
 	movq	%rbp,PCB_RBP(%rax)
 skip_save:
 
+#ifdef SVS
+	pushq	%rdx
+	movb	_C_LABEL(svs_enabled),%dl
+	testb	%dl,%dl
+	jz	.Lskip_svs
+	callq	_C_LABEL(svs_lwp_switch)
+.Lskip_svs:
+	popq	%rdx
+#endif
+
 	/* Switch to newlwp's stack. */
 	movq	L_PCB(%r12),%r14
 	movq	PCB_RSP(%r14),%rsp
@@ -1097,6 +1111,19 @@ skip_save:
 	jnz	switch_return
 
 	/* Switch ring0 stack */
+#ifdef SVS
+	movb	_C_LABEL(svs_enabled),%al
+	testb	%al,%al
+	jz	.Lno_svs_switch
+
+	movq	CPUVAR(RSP0),%rax
+	movq	CPUVAR(TSS),%rdi
+	movq	%rax,TSS_RSP0(%rdi)
+	jmp	.Lring0_switched
+
+.Lno_svs_switch:
+#endif
+
 #ifndef XEN
 	movq	PCB_RSP0(%r14),%rax
 	movq	CPUVAR(TSS),%rdi
@@ -1105,6 +1132,7 @@ skip_save:
 	movq	%r14,%rdi
 	callq	_C_LABEL(x86_64_switch_context);
 #endif
+.Lring0_switched:
 
 	/* Don't bother with the rest if switching to a system process. */
 	testl	$LW_SYSTEM,L_FLAG(%r12)
@@ -1223,74 +1251,12 @@ ENTRY(savectx)
 	ret
 END(savectx)
 
-IDTVEC(syscall32)
-	sysret		/* go away please */
-IDTVEC_END(syscall32)
-
 /*
- * syscall()
- *
- * syscall insn entry.
+ * Syscall handler.
  */
-IDTVEC(syscall)
-#ifndef XEN
-	/*
-	 * The user %rip is in %rcx and the user %rflags in %r11. The kernel %cs
-	 * and %ss are loaded, but nothing else is.
-	 *
-	 * The 'swapgs' instruction gives us access to cpu-specific memory where
-	 * we can save a user register and then read the LWP's kernel stack
-	 * pointer.
-	 *
-	 * This code doesn't seem to set %ds, this may not matter since it is
-	 * ignored in 64bit mode, OTOH the syscall instruction sets %ss and that
-	 * is ignored as well.
-	 */
-	swapgs
-	movq	%r15,CPUVAR(SCRATCH)
-	movq	CPUVAR(CURLWP),%r15
-	movq	L_PCB(%r15),%r15
-	movq	PCB_RSP0(%r15),%r15	/* LWP's kernel stack pointer */
-
-	/* Make stack look like an 'int nn' frame */
-#define SP(x)	(x)-(TF_SS+8)(%r15)
-	movq	$(LSEL(LUDATA_SEL, SEL_UPL)),SP(TF_SS)	/* user %ss */
-	movq	%rsp,SP(TF_RSP)				/* user %rsp */
-	movq	%r11,SP(TF_RFLAGS)			/* user %rflags */
-	movq	$(LSEL(LUCODE_SEL, SEL_UPL)),SP(TF_CS)	/* user %cs */
-	movq	%rcx,SP(TF_RIP)				/* user %rip */
-
-	leaq	SP(0),%rsp		/* %rsp now valid after frame */
-	movq	CPUVAR(SCRATCH),%r15
-#undef SP
-
-	movq	$2,TF_ERR(%rsp)		/* syscall instruction size */
-	movq	$T_ASTFLT,TF_TRAPNO(%rsp)
-
-	movw	%es,TF_ES(%rsp)
-	sti
-	INTR_SAVE_GPRS
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	$(LSEL(LUDATA_SEL, SEL_UPL)),TF_DS(%rsp)
-#else
-	/* Xen already switched to kernel stack */
-	pushq	%rsi
+NENTRY(handle_syscall)
 	STI(si)
-	popq	%rsi
-	addq	$0x10,%rsp	/* gap to match cs:rip */
-	pushq	$2		/* error code */
-	pushq	$T_ASTFLT
-	subq	$TF_REGSIZE,%rsp
-	INTR_SAVE_GPRS
-	cld
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	$(LSEL(LUDATA_SEL, SEL_UPL)),TF_DS(%rsp)
-#endif
 
-do_syscall:
 	movq	CPUVAR(CURLWP),%r14
 	incq	CPUVAR(NSYSCALL)	/* count it atomically */
 	movq	%rsp,L_MD_REGS(%r14)	/* save pointer to frame */
@@ -1315,32 +1281,18 @@ do_syscall:
 	jne	spl_error
 #endif
 
+	/*
+	 * Decide if we need to take a slow path. That's the case when we
+	 * want to reload %cs and %ss on a 64bit LWP (MDL_IRET set), or when
+	 * we're returning to a 32bit LWP (MDL_COMPAT32 set).
+	 *
+	 * In either case, we jump into intrfastexit and return to userland
+	 * with the iret instruction.
+	 */
 	testl	$(MDL_IRET|MDL_COMPAT32),L_MD_FLAGS(%r14)
-	INTR_RESTORE_GPRS
-	movw	TF_ES(%rsp),%es
-	movw	TF_DS(%rsp),%ds
-	SWAPGS
-	jnz	2f
-#ifndef XEN
-	movq	TF_RIP(%rsp),%rcx	/* %rip for sysret */
-	movq	TF_RFLAGS(%rsp),%r11	/* %flags for sysret */
-	movq	TF_RSP(%rsp),%rsp
-	sysretq
-#else
-	addq	$TF_RIP,%rsp
-	pushq	$256	/* VGCF_IN_SYSCALL */
-	jmp	HYPERVISOR_iret
-#endif
+	jnz	intrfastexit
 
-/*
- * If the syscall might have modified some registers, or we are a 32bit
- * process we must return to user with an 'iret' instruction.
- * If the iret faults in kernel (assumed due to illegal register values)
- * then a SIGSEGV will be signalled.
- */
-2:
-	addq	$TF_RIP,%rsp
-	iretq
+	jmp	syscall_sysret
 
 #ifdef DIAGNOSTIC
 	/* Report SPL error */
@@ -1372,7 +1324,7 @@ spl_error:
 	movq	%rsp,%rdi
 	call	_C_LABEL(trap)
 	jmp	.Lsyscall_checkast	/* re-check ASTs */
-IDTVEC_END(syscall)
+END(handle_syscall)
 
 /*
  * void lwp_trampoline(void);
@@ -1392,10 +1344,96 @@ NENTRY(lwp_trampoline)
 END(lwp_trampoline)
 
 /*
+ * Entry points of the 'syscall' instruction, 64bit and 32bit mode.
+ */
+
+#define SP(x)	(x)-(TF_SS+8)(%rax)
+
+.macro	SYSCALL_ENTRY	name,is_svs
+IDTVEC(\name)
+#ifndef XEN
+	/*
+	 * The user %rip is in %rcx and the user %rflags in %r11. The kernel %cs
+	 * and %ss are loaded, but nothing else is.
+	 *
+	 * The 'swapgs' instruction gives us access to cpu-specific memory where
+	 * we can save a user register and then read the LWP's kernel stack
+	 * pointer.
+	 *
+	 * This code doesn't seem to set %ds, this may not matter since it is
+	 * ignored in 64bit mode, OTOH the syscall instruction sets %ss and that
+	 * is ignored as well.
+	 */
+	swapgs
+
+	/* Get the LWP's kernel stack pointer in %rax */
+	.if	\is_svs
+		movq	%rax,SVS_UTLS+UTLS_SCRATCH
+		movq	SVS_UTLS+UTLS_RSP0,%rax
+	.else
+		movq	%rax,CPUVAR(SCRATCH)
+		movq	CPUVAR(CURLWP),%rax
+		movq	L_PCB(%rax),%rax
+		movq	PCB_RSP0(%rax),%rax
+	.endif
+
+	/* Make stack look like an 'int nn' frame */
+	movq	$(LSEL(LUDATA_SEL, SEL_UPL)),SP(TF_SS)	/* user %ss */
+	movq	%rsp,SP(TF_RSP)				/* user %rsp */
+	movq	%r11,SP(TF_RFLAGS)			/* user %rflags */
+	movq	$(LSEL(LUCODE_SEL, SEL_UPL)),SP(TF_CS)	/* user %cs */
+	movq	%rcx,SP(TF_RIP)				/* user %rip */
+	leaq	SP(0),%rsp		/* %rsp now valid after frame */
+
+	/* Restore %rax */
+	.if	\is_svs
+		movq	SVS_UTLS+UTLS_SCRATCH,%rax
+	.else
+		movq	CPUVAR(SCRATCH),%rax
+	.endif
+
+	movq	$2,TF_ERR(%rsp)		/* syscall instruction size */
+	movq	$T_ASTFLT,TF_TRAPNO(%rsp)
+#else
+	/* Xen already switched to kernel stack */
+	addq	$0x10,%rsp	/* gap to match cs:rip */
+	pushq	$2		/* error code */
+	pushq	$T_ASTFLT
+	subq	$TF_REGSIZE,%rsp
+	cld
+#endif
+	INTR_SAVE_GPRS
+	movw	$(LSEL(LUDATA_SEL, SEL_UPL)),TF_DS(%rsp)
+	movw	%es,TF_ES(%rsp)
+	movw	%fs,TF_FS(%rsp)
+	movw	%gs,TF_GS(%rsp)
+	.if	\is_svs
+		SVS_ENTER
+	.endif
+	jmp	handle_syscall
+IDTVEC_END(\name)
+.endm
+
+SYSCALL_ENTRY	syscall,is_svs=0
+
+	TEXT_USER_BEGIN
+
+#ifdef SVS
+SYSCALL_ENTRY	syscall_svs,is_svs=1
+#endif
+
+IDTVEC(syscall32)
+	sysret		/* go away please */
+IDTVEC_END(syscall32)
+
+	TEXT_USER_END
+
+/*
  * osyscall()
  *
  * Trap gate entry for int $80 syscall, also used by sigreturn.
  */
+	TEXT_USER_BEGIN
 IDTVEC(osyscall)
 #ifdef XEN
 	movq (%rsp),%rcx
@@ -1405,9 +1443,37 @@ IDTVEC(osyscall)
 	pushq	$2		/* size of instruction for restart */
 	pushq	$T_ASTFLT	/* trap # for doing ASTs */
 	INTRENTRY
-	STI(si)
-	jmp	do_syscall
+	jmp	handle_syscall
 IDTVEC_END(osyscall)
+	TEXT_USER_END
+
+/*
+ * Return to userland via 'sysret'.
+ */
+	TEXT_USER_BEGIN
+	_ALIGN_TEXT
+LABEL(syscall_sysret)
+	SVS_LEAVE
+
+	/* Set default the 64bit values in %ds and %es. */
+	movq	$GSEL(GUDATA_SEL, SEL_UPL),%rax
+	movw	%ax,%ds
+	movw	%ax,%es
+
+	INTR_RESTORE_GPRS
+	SWAPGS
+#ifndef XEN
+	movq	TF_RIP(%rsp),%rcx	/* %rip for sysret */
+	movq	TF_RFLAGS(%rsp),%r11	/* %flags for sysret */
+	movq	TF_RSP(%rsp),%rsp
+	sysretq
+#else
+	addq	$TF_RIP,%rsp
+	pushq	$256	/* VGCF_IN_SYSCALL */
+	jmp	HYPERVISOR_iret
+#endif
+END(syscall_sysret)
+	TEXT_USER_END
 
 /*
  * bool sse2_idlezero_page(void *pg)
@@ -1451,7 +1517,6 @@ END(sse2_idlezero_page)
  *
  * Zero a page without polluting the cache.
  */
-
 ENTRY(pagezero)
 	movq	$-PAGE_SIZE,%rdx
 	subq	%rdx,%rdi
@@ -1471,15 +1536,92 @@ ENTRY(pagezero)
 	ret
 END(pagezero)
 
+	TEXT_USER_BEGIN
+
+/*
+ * In intrfastexit, we advance %rsp at the beginning. We then access the
+ * segment registers in the trapframe with TF_BACKW (backwards). See the
+ * documentation in amd64_trap.S for an explanation.
+ */
+
+#define TF_BACKW(val, reg)	(val - (TF_REGSIZE+16))(reg)
+
 	_ALIGN_TEXT
 LABEL(intrfastexit)
-	INTR_RESTORE_GPRS
-	testq	$SEL_UPL,TF_CS(%rsp)
-	je	99f
 	NOT_XEN(cli;)
-	movw	TF_ES(%rsp),%es
-	movw	TF_DS(%rsp),%ds
+	SVS_LEAVE
+	INTR_RESTORE_GPRS
+	addq	$(TF_REGSIZE+16),%rsp	/* iret frame */
+
+	testb	$SEL_UPL,TF_BACKW(TF_CS, %rsp)
+	jz	.Lkexit
 	SWAPGS
-99:	addq	$TF_REGSIZE+16,%rsp
+do_mov_es:
+	movw	TF_BACKW(TF_ES, %rsp),%es
+do_mov_ds:
+	movw	TF_BACKW(TF_DS, %rsp),%ds
+
+.Lkexit:
+do_iret:
 	iretq
 END(intrfastexit)
+
+	TEXT_USER_END
+
+#ifdef SVS
+	.globl	svs_enter, svs_enter_end
+	.globl	svs_enter_altstack, svs_enter_altstack_end
+	.globl	svs_leave, svs_leave_end
+	.globl	svs_leave_altstack, svs_leave_altstack_end
+	.globl	nosvs_enter, nosvs_enter_end
+	.globl	nosvs_enter_altstack, nosvs_enter_altstack_end
+	.globl	nosvs_leave, nosvs_leave_end
+	.globl	nosvs_leave_altstack, nosvs_leave_altstack_end
+
+LABEL(svs_enter)
+	movq	SVS_UTLS+UTLS_KPDIRPA,%rax
+	movq	%rax,%cr3
+	movq	CPUVAR(KRSP0),%rsp
+LABEL(svs_enter_end)
+
+LABEL(svs_enter_altstack)
+	testb	$SEL_UPL,TF_CS(%rsp)
+	jz	1234f
+	movq	SVS_UTLS+UTLS_KPDIRPA,%rax
+	movq	%rax,%cr3
+1234:
+LABEL(svs_enter_altstack_end)
+
+LABEL(svs_leave)
+	testb	$SEL_UPL,TF_CS(%rsp)
+	jz	1234f
+	movq	CPUVAR(URSP0),%rsp
+	movq	CPUVAR(UPDIRPA),%rax
+	movq	%rax,%cr3
+1234:
+LABEL(svs_leave_end)
+
+LABEL(svs_leave_altstack)
+	testb	$SEL_UPL,TF_CS(%rsp)
+	jz	1234f
+	movq	CPUVAR(UPDIRPA),%rax
+	movq	%rax,%cr3
+1234:
+LABEL(svs_leave_altstack_end)
+
+LABEL(nosvs_enter)
+	NOSVS_ENTER
+LABEL(nosvs_enter_end)
+
+LABEL(nosvs_enter_altstack)
+	NOSVS_ENTER_ALTSTACK
+LABEL(nosvs_enter_altstack_end)
+
+LABEL(nosvs_leave)
+	NOSVS_LEAVE
+LABEL(nosvs_leave_end)
+
+LABEL(nosvs_leave_altstack)
+	NOSVS_LEAVE_ALTSTACK
+LABEL(nosvs_leave_altstack_end)
+#endif

Index: src/sys/arch/amd64/amd64/machdep.c
diff -u src/sys/arch/amd64/amd64/machdep.c:1.255.6.5 src/sys/arch/amd64/amd64/machdep.c:1.255.6.6
--- src/sys/arch/amd64/amd64/machdep.c:1.255.6.5	Fri Mar 16 13:17:56 2018
+++ src/sys/arch/amd64/amd64/machdep.c	Thu Mar 22 16:59:03 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: machdep.c,v 1.255.6.5 2018/03/16 13:17:56 martin Exp $	*/
+/*	$NetBSD: machdep.c,v 1.255.6.6 2018/03/22 16:59:03 martin Exp $	*/
 
 /*-
  * Copyright (c) 1996, 1997, 1998, 2000, 2006, 2007, 2008, 2011
@@ -111,7 +111,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.255.6.5 2018/03/16 13:17:56 martin Exp $");
+__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.255.6.6 2018/03/22 16:59:03 martin Exp $");
 
 /* #define XENDEBUG_LOW  */
 
@@ -123,6 +123,7 @@ __KERNEL_RCSID(0, "$NetBSD: machdep.c,v 
 #include "opt_mtrr.h"
 #include "opt_realmem.h"
 #include "opt_xen.h"
+#include "opt_svs.h"
 #ifndef XEN
 #include "opt_physmem.h"
 #endif
@@ -1544,6 +1545,9 @@ init_x86_64(paddr_t first_avail)
 #endif /* XEN */
 
 	cpu_probe(&cpu_info_primary);
+#ifdef SVS
+	svs_init();
+#endif
 	cpu_init_msrs(&cpu_info_primary, true);
 
 	use_pae = 1; /* PAE always enabled in long mode */

Index: src/sys/arch/amd64/amd64/trap.c
diff -u src/sys/arch/amd64/amd64/trap.c:1.96.4.1 src/sys/arch/amd64/amd64/trap.c:1.96.4.2
--- src/sys/arch/amd64/amd64/trap.c:1.96.4.1	Wed Mar  7 14:50:56 2018
+++ src/sys/arch/amd64/amd64/trap.c	Thu Mar 22 16:59:03 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: trap.c,v 1.96.4.1 2018/03/07 14:50:56 martin Exp $	*/
+/*	$NetBSD: trap.c,v 1.96.4.2 2018/03/22 16:59:03 martin Exp $	*/
 
 /*-
  * Copyright (c) 1998, 2000 The NetBSD Foundation, Inc.
@@ -68,7 +68,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: trap.c,v 1.96.4.1 2018/03/07 14:50:56 martin Exp $");
+__KERNEL_RCSID(0, "$NetBSD: trap.c,v 1.96.4.2 2018/03/22 16:59:03 martin Exp $");
 
 #include "opt_ddb.h"
 #include "opt_kgdb.h"
@@ -266,9 +266,6 @@ trap(struct trapframe *frame)
 	extern char fusuintrfailure[], kcopy_fault[];
 	extern char IDTVEC(osyscall)[];
 	extern char IDTVEC(syscall32)[];
-#ifndef XEN
-	struct trapframe *vframe;
-#endif
 	ksiginfo_t ksi;
 	void *onfault;
 	int type, error;
@@ -357,92 +354,7 @@ copyfault:
 			return;
 		}
 
-		/*
-		 * Check for failure during return to user mode.
-		 * This can happen loading invalid values into the segment
-		 * registers, or during the 'iret' itself.
-		 *
-		 * We do this by looking at the instruction we faulted on.
-		 * The specific instructions we recognize only happen when
-		 * returning from a trap, syscall, or interrupt.
-		 */
-
-kernelfault:
-#ifdef XEN
-		/*
-		 * XXX: there has to be an equivalent 'problem'
-		 * but I (dsl) don't know exactly what happens!
-		 * For now panic the kernel.
-		 */
 		goto we_re_toast;
-#else
-		KSI_INIT_TRAP(&ksi);
-		ksi.ksi_signo = SIGSEGV;
-		ksi.ksi_code = SEGV_ACCERR;
-		ksi.ksi_trap = type;
-
-		/* Get %rsp value before fault - there may be a pad word
-		 * below the trap frame. */
-		vframe = (void *)frame->tf_rsp;
-		if (frame->tf_rip == 0) {
-			/*
-			 * Assume that if we jumped to null we
-			 * probably did it via a null function
-			 * pointer, so print the return address.
-			 */
-			printf("kernel jumped to null; return addr was %p\n",
-			       *(void **)frame->tf_rsp);
-			goto we_re_toast;
-		}
-		switch (*(uint16_t *)frame->tf_rip) {
-		case 0xcf48:	/* iretq */
-			/*
-			 * The 'iretq' instruction faulted, so we have the
-			 * 'user' registers saved after the kernel
-			 * %rip:%cs:%fl:%rsp:%ss of the iret, and below that
-			 * the user %rip:%cs:%fl:%rsp:%ss the 'iret' was
-			 * processing.
-			 * We must copy the user register back over the
-			 * kernel fault frame to generate a normal stack
-			 * frame (eg for sending a SIGSEGV).
-			 */
-			vframe = (void *)((char *)vframe
-			    - offsetof(struct trapframe, tf_rip));
-			memmove(vframe, frame,
-			    offsetof(struct trapframe, tf_rip));
-			/* Set the faulting address to the user %eip */
-			ksi.ksi_addr = (void *)vframe->tf_rip;
-			break;
-		case 0x848e:	/* mov 0xa8(%rsp),%es (8e 84 24 a8 00 00 00) */
-		case 0x9c8e:	/* mov 0xb0(%rsp),%ds (8e 9c 24 b0 00 00 00) */
-#ifdef USER_LDT
-		case 0xa48e:	/* mov 0xa0(%rsp),%fs (8e a4 24 a0 00 00 00) */
-		case 0xac8e:	/* mov 0x98(%rsp),%gs (8e ac 24 98 00 00 00) */
-#endif
-			/*
-			 * We faulted loading one of the user segment registers.
-			 * The stack frame containing the user registers is
-			 * still valid and pointed to by tf_rsp.
-			 * Maybe we should check the iretq follows.
-			 */
-			if (KERNELMODE(vframe->tf_cs, vframe->tf_eflags))
-				goto we_re_toast;
-			/* There is no valid address for the fault */
-			break;
-
-		default:
-			goto we_re_toast;
-		}
-
-		/* XXX: worry about on-stack trampolines for nested
-		 * handlers?? */
-		/* Save outer frame for any signal return */
-		l->l_md.md_regs = vframe;
-		(*p->p_emul->e_trapsignal)(l, &ksi);
-		/* Return to user by reloading the user frame */
-		trap_return_fault_return(vframe);
-		/* NOTREACHED */
-#endif
 
 	case T_PROTFLT|T_USER:		/* protection fault */
 #if defined(COMPAT_NETBSD32) && defined(COMPAT_10)
@@ -700,7 +612,7 @@ faultcommon:
 				goto copyfault;
 			printf("uvm_fault(%p, 0x%lx, %d) -> %x\n",
 			    map, va, ftype, error);
-			goto kernelfault;
+			goto we_re_toast;
 		}
 
 		KSI_INIT_TRAP(&ksi);

Index: src/sys/arch/amd64/amd64/vector.S
diff -u src/sys/arch/amd64/amd64/vector.S:1.49.2.1 src/sys/arch/amd64/amd64/vector.S:1.49.2.2
--- src/sys/arch/amd64/amd64/vector.S:1.49.2.1	Wed Mar  7 14:50:56 2018
+++ src/sys/arch/amd64/amd64/vector.S	Thu Mar 22 16:59:03 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: vector.S,v 1.49.2.1 2018/03/07 14:50:56 martin Exp $	*/
+/*	$NetBSD: vector.S,v 1.49.2.2 2018/03/22 16:59:03 martin Exp $	*/
 
 /*-
  * Copyright (c) 1998, 2007, 2008 The NetBSD Foundation, Inc.
@@ -114,10 +114,7 @@ IDTVEC(recurse_lapic_ipi)
 	INTRENTRY
 	jmp	1f
 IDTVEC_END(recurse_lapic_ipi)
-IDTVEC(intr_x2apic_ipi)
-	pushq	$0
-	pushq	$T_ASTFLT
-	INTRENTRY
+IDTVEC(handle_x2apic_ipi)
 	movl	$(MSR_X2APIC_BASE + MSR_X2APIC_EOI),%ecx
 	xorl	%eax,%eax
 	xorl	%edx,%edx
@@ -126,17 +123,14 @@ IDTVEC(intr_x2apic_ipi)
 	cmpl	$IPL_HIGH,%ebx
 	jae	2f
 	jmp	1f
-IDTVEC_END(intr_x2apic_ipi)
-IDTVEC(intr_lapic_ipi)
-	pushq	$0
-	pushq	$T_ASTFLT
-	INTRENTRY
+IDTVEC_END(handle_x2apic_ipi)
+IDTVEC(handle_lapic_ipi)
 	movq	_C_LABEL(local_apic_va),%rbx
 	movl	$0,LAPIC_EOI(%rbx)
 	movl	CPUVAR(ILEVEL),%ebx
 	cmpl	$IPL_HIGH,%ebx
 	jae	2f
-IDTVEC_END(intr_lapic_ipi)
+IDTVEC_END(handle_lapic_ipi)
 IDTVEC(resume_lapic_ipi)
 1:
 	incl	CPUVAR(IDEPTH)
@@ -150,12 +144,23 @@ IDTVEC(resume_lapic_ipi)
 	INTRFASTEXIT
 IDTVEC_END(resume_lapic_ipi)
 
-#if defined(DDB)
-IDTVEC(intrddb)
-1:
+	TEXT_USER_BEGIN
+IDTVEC(intr_x2apic_ipi)
 	pushq	$0
-	pushq	$T_BPTFLT
+	pushq	$T_ASTFLT
+	INTRENTRY
+	jmp	_C_LABEL(Xhandle_x2apic_ipi)
+IDTVEC_END(intr_x2apic_ipi)
+IDTVEC(intr_lapic_ipi)
+	pushq	$0
+	pushq	$T_ASTFLT
 	INTRENTRY
+	jmp	_C_LABEL(Xhandle_lapic_ipi)
+IDTVEC_END(intr_lapic_ipi)
+	TEXT_USER_END
+
+#if defined(DDB)
+IDTVEC(handle_intrddbipi)
 	movl	$0xf,%eax
 	movq	%rax,%cr8
 	movq	_C_LABEL(local_apic_va),%rbx
@@ -165,13 +170,8 @@ IDTVEC(intrddb)
 	xorl	%eax,%eax
 	movq	%rax,%cr8
 	INTRFASTEXIT
-IDTVEC_END(intrddb)
-
-IDTVEC(x2apic_intrddb)
-1:
-	pushq	$0
-	pushq	$T_BPTFLT
-	INTRENTRY
+IDTVEC_END(handle_intrddbipi)
+IDTVEC(handle_x2apic_intrddbipi)
 	movl	$0xf,%eax
 	movq	%rax,%cr8
 	movl	$(MSR_X2APIC_BASE + MSR_X2APIC_EOI),%ecx
@@ -183,7 +183,23 @@ IDTVEC(x2apic_intrddb)
 	xorl	%eax,%eax
 	movq	%rax,%cr8
 	INTRFASTEXIT
+IDTVEC_END(handle_x2apic_intrddbipi)
+
+	TEXT_USER_BEGIN
+IDTVEC(intrddb)
+	pushq	$0
+	pushq	$T_BPTFLT
+	INTRENTRY
+	jmp	_C_LABEL(Xhandle_intrddbipi)
+IDTVEC_END(intrddb)
+IDTVEC(x2apic_intrddb)
+	pushq	$0
+	pushq	$T_BPTFLT
+	INTRENTRY
+	jmp	_C_LABEL(Xhandle_x2apic_intrddbipi)
 IDTVEC_END(x2apic_intrddb)
+	TEXT_USER_END
+
 #endif /* DDB */
 #endif /* MULTIPROCESSOR */
 
@@ -197,10 +213,7 @@ IDTVEC(recurse_lapic_ltimer)
 	INTRENTRY
 	jmp	1f
 IDTVEC_END(recurse_lapic_ltimer)
-IDTVEC(intr_x2apic_ltimer)
-	pushq	$0
-	pushq	$T_ASTFLT
-	INTRENTRY
+IDTVEC(handle_x2apic_ltimer)
 	movl	$(MSR_X2APIC_BASE + MSR_X2APIC_EOI),%ecx
 	xorl	%eax,%eax
 	xorl	%edx,%edx
@@ -209,11 +222,8 @@ IDTVEC(intr_x2apic_ltimer)
 	cmpl	$IPL_CLOCK,%ebx
 	jae	2f
 	jmp	1f
-IDTVEC_END(intr_x2apic_ltimer)
-IDTVEC(intr_lapic_ltimer)
-	pushq	$0
-	pushq	$T_ASTFLT
-	INTRENTRY
+IDTVEC_END(handle_x2apic_ltimer)
+IDTVEC(handle_lapic_ltimer)
 	movq	_C_LABEL(local_apic_va),%rbx
 	movl	$0,LAPIC_EOI(%rbx)
 	movl	CPUVAR(ILEVEL),%ebx
@@ -234,33 +244,57 @@ IDTVEC(resume_lapic_ltimer)
 	orl	$(1 << LIR_TIMER),CPUVAR(IPENDING)
 	INTRFASTEXIT
 IDTVEC_END(resume_lapic_ltimer)
+
+	TEXT_USER_BEGIN
+IDTVEC(intr_x2apic_ltimer)
+	pushq	$0
+	pushq	$T_ASTFLT
+	INTRENTRY
+	jmp	_C_LABEL(Xhandle_x2apic_ltimer)
+IDTVEC_END(intr_x2apic_ltimer)
+IDTVEC(intr_lapic_ltimer)
+	pushq	$0
+	pushq	$T_ASTFLT
+	INTRENTRY
+	jmp	_C_LABEL(Xhandle_lapic_ltimer)
+IDTVEC_END(intr_lapic_ltimer)
+	TEXT_USER_END
+
 #endif /* NLAPIC > 0 */
 
 #ifndef XEN
 /*
  * TLB shootdown handler.
  */
-IDTVEC(intr_lapic_tlb)
-	pushq	$0
-	pushq	$T_ASTFLT
-	INTRENTRY
+IDTVEC(handle_lapic_tlb)
 	movq	_C_LABEL(local_apic_va),%rax
 	movl	$0,LAPIC_EOI(%rax)
 	callq	_C_LABEL(pmap_tlb_intr)
 	INTRFASTEXIT
-IDTVEC_END(intr_lapic_tlb)
-
-IDTVEC(intr_x2apic_tlb)
-	pushq	$0
-	pushq	$T_ASTFLT
-	INTRENTRY
+IDTVEC_END(handle_lapic_tlb)
+IDTVEC(handle_x2apic_tlb)
 	movl	$(MSR_X2APIC_BASE + MSR_X2APIC_EOI),%ecx
 	xorl	%eax,%eax
 	xorl	%edx,%edx
 	wrmsr
 	callq	_C_LABEL(pmap_tlb_intr)
 	INTRFASTEXIT
+IDTVEC_END(handle_x2apic_tlb)
+
+	TEXT_USER_BEGIN
+IDTVEC(intr_lapic_tlb)
+	pushq	$0
+	pushq	$T_ASTFLT
+	INTRENTRY
+	jmp	_C_LABEL(Xhandle_lapic_tlb)
+IDTVEC_END(intr_lapic_tlb)
+IDTVEC(intr_x2apic_tlb)
+	pushq	$0
+	pushq	$T_ASTFLT
+	INTRENTRY
+	jmp	_C_LABEL(Xhandle_x2apic_tlb)
 IDTVEC_END(intr_x2apic_tlb)
+	TEXT_USER_END
 
 #endif /* !XEN */
 
@@ -269,7 +303,7 @@ IDTVEC_END(intr_x2apic_tlb)
 #ifndef XEN
 
 /*
- * This macro defines the generic stub code. Its arguments modifiy it
+ * This macro defines the generic stub code. Its arguments modify it
  * for specific PICs.
  */
 
@@ -285,10 +319,7 @@ IDTVEC(resume_ ## name ## num)						\
 	movq	CPUVAR(ISOURCES) + (num) * 8,%r14			;\
 	movl	IS_MAXLEVEL(%r14),%ebx					;\
 	jmp	1f							;\
-IDTVEC(intr_ ## name ## num)						;\
-	pushq	$0			/* dummy error code */		;\
-	pushq	$T_ASTFLT		/* trap # for doing ASTs */	;\
-	INTRENTRY							;\
+IDTVEC(handle_ ## name ## num)						;\
 	movq	CPUVAR(ISOURCES) + (num) * 8,%r14			;\
 	mask(num)			/* mask it in hardware */	;\
 	early_ack(num)			/* and allow other intrs */	;\
@@ -339,7 +370,16 @@ IDTVEC(intr_ ## name ## num)						;\
 9:									\
 	unmask(num)							;\
 	late_ack(num)							;\
-	INTRFASTEXIT
+	INTRFASTEXIT							;\
+IDTVEC_END(handle_ ## name ## num)					;\
+	TEXT_USER_BEGIN							;\
+IDTVEC(intr_ ## name ## num)						;\
+	pushq	$0			/* dummy error code */		;\
+	pushq	$T_ASTFLT		/* trap # for doing ASTs */	;\
+	INTRENTRY							;\
+	jmp	_C_LABEL(Xhandle_ ## name ## num)			;\
+IDTVEC_END(intr_ ## name ## num)					;\
+	TEXT_USER_END
 
 #define ICUADDR IO_ICU1
 

Index: src/sys/arch/amd64/conf/GENERIC
diff -u src/sys/arch/amd64/conf/GENERIC:1.459.2.5 src/sys/arch/amd64/conf/GENERIC:1.459.2.6
--- src/sys/arch/amd64/conf/GENERIC:1.459.2.5	Sun Feb 11 21:17:34 2018
+++ src/sys/arch/amd64/conf/GENERIC	Thu Mar 22 16:59:03 2018
@@ -1,4 +1,4 @@
-# $NetBSD: GENERIC,v 1.459.2.5 2018/02/11 21:17:34 snj Exp $
+# $NetBSD: GENERIC,v 1.459.2.6 2018/03/22 16:59:03 martin Exp $
 #
 # GENERIC machine description file
 #
@@ -22,7 +22,7 @@ include 	"arch/amd64/conf/std.amd64"
 
 options 	INCLUDE_CONFIG_FILE	# embed config file in kernel binary
 
-#ident		"GENERIC-$Revision: 1.459.2.5 $"
+#ident		"GENERIC-$Revision: 1.459.2.6 $"
 
 maxusers	64		# estimated number of users
 
@@ -73,6 +73,9 @@ options 	USERCONF	# userconf(4) support
 #options 	PIPE_SOCKETPAIR	# smaller, but slower pipe(2)
 options 	SYSCTL_INCLUDE_DESCR	# Include sysctl descriptions in kernel
 
+# CPU-related options
+#options 	SVS		# Separate Virtual Space
+
 # CPU features
 acpicpu*	at cpu?		# ACPI CPU (including frequency scaling)
 coretemp*	at cpu?		# Intel on-die thermal sensor

Index: src/sys/arch/amd64/conf/kern.ldscript
diff -u src/sys/arch/amd64/conf/kern.ldscript:1.22.6.2 src/sys/arch/amd64/conf/kern.ldscript:1.22.6.3
--- src/sys/arch/amd64/conf/kern.ldscript:1.22.6.2	Tue Mar  6 10:17:11 2018
+++ src/sys/arch/amd64/conf/kern.ldscript	Thu Mar 22 16:59:03 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: kern.ldscript,v 1.22.6.2 2018/03/06 10:17:11 martin Exp $	*/
+/*	$NetBSD: kern.ldscript,v 1.22.6.3 2018/03/22 16:59:03 martin Exp $	*/
 
 #include "assym.h"
 
@@ -15,6 +15,12 @@ SECTIONS
 {
 	.text :
 	{
+		. = ALIGN(__PAGE_SIZE);
+		__text_user_start = . ;
+		*(.text.user)
+		. = ALIGN(__PAGE_SIZE);
+		__text_user_end = . ;
+
 		*(.text)
 		*(.text.*)
 		*(.stub)

Index: src/sys/arch/amd64/include/frameasm.h
diff -u src/sys/arch/amd64/include/frameasm.h:1.20.32.1 src/sys/arch/amd64/include/frameasm.h:1.20.32.2
--- src/sys/arch/amd64/include/frameasm.h:1.20.32.1	Wed Mar  7 14:50:57 2018
+++ src/sys/arch/amd64/include/frameasm.h	Thu Mar 22 16:59:03 2018
@@ -1,10 +1,11 @@
-/*	$NetBSD: frameasm.h,v 1.20.32.1 2018/03/07 14:50:57 martin Exp $	*/
+/*	$NetBSD: frameasm.h,v 1.20.32.2 2018/03/22 16:59:03 martin Exp $	*/
 
 #ifndef _AMD64_MACHINE_FRAMEASM_H
 #define _AMD64_MACHINE_FRAMEASM_H
 
 #ifdef _KERNEL_OPT
 #include "opt_xen.h"
+#include "opt_svs.h"
 #endif
 
 /*
@@ -35,6 +36,19 @@
 #define STI(temp_reg) sti
 #endif	/* XEN */
 
+#define HP_NAME_SVS_ENTER	5
+#define HP_NAME_SVS_LEAVE	6
+#define HP_NAME_SVS_ENTER_ALT	7
+#define HP_NAME_SVS_LEAVE_ALT	8
+
+#define HOTPATCH(name, size) \
+123:						; \
+	.pushsection	.rodata.hotpatch, "a"	; \
+	.byte		name			; \
+	.byte		size			; \
+	.quad		123b			; \
+	.popsection
+
 #define	SWAPGS	NOT_XEN(swapgs)
 
 /*
@@ -74,21 +88,68 @@
 	movq	TF_RBX(%rsp),%rbx	; \
 	movq	TF_RAX(%rsp),%rax
 
-#define	INTRENTRY_L(kernel_trap, usertrap) \
+#define TEXT_USER_BEGIN	.pushsection	.text.user, "ax"
+#define TEXT_USER_END	.popsection
+
+#ifdef SVS
+
+/* XXX: put this somewhere else */
+#define SVS_UTLS		0xffffc00000000000 /* PMAP_PCPU_BASE */
+#define UTLS_KPDIRPA		0
+#define UTLS_SCRATCH		8
+#define UTLS_RSP0		16
+
+#define SVS_ENTER_BYTES	22
+#define NOSVS_ENTER \
+	.byte 0xEB, (SVS_ENTER_BYTES-2)	/* jmp */	; \
+	.fill	(SVS_ENTER_BYTES-2),1,0xCC
+#define SVS_ENTER \
+	HOTPATCH(HP_NAME_SVS_ENTER, SVS_ENTER_BYTES)	; \
+	NOSVS_ENTER
+
+#define SVS_LEAVE_BYTES	31
+#define NOSVS_LEAVE \
+	.byte 0xEB, (SVS_LEAVE_BYTES-2)	/* jmp */	; \
+	.fill	(SVS_LEAVE_BYTES-2),1,0xCC
+#define SVS_LEAVE \
+	HOTPATCH(HP_NAME_SVS_LEAVE, SVS_LEAVE_BYTES)	; \
+	NOSVS_LEAVE
+
+#define SVS_ENTER_ALT_BYTES	23
+#define NOSVS_ENTER_ALTSTACK \
+	.byte 0xEB, (SVS_ENTER_ALT_BYTES-2)	/* jmp */	; \
+	.fill	(SVS_ENTER_ALT_BYTES-2),1,0xCC
+#define SVS_ENTER_ALTSTACK \
+	HOTPATCH(HP_NAME_SVS_ENTER_ALT, SVS_ENTER_ALT_BYTES)	; \
+	NOSVS_ENTER_ALTSTACK
+
+#define SVS_LEAVE_ALT_BYTES	22
+#define NOSVS_LEAVE_ALTSTACK \
+	.byte 0xEB, (SVS_LEAVE_ALT_BYTES-2)	/* jmp */	; \
+	.fill	(SVS_LEAVE_ALT_BYTES-2),1,0xCC
+#define SVS_LEAVE_ALTSTACK \
+	HOTPATCH(HP_NAME_SVS_LEAVE_ALT, SVS_LEAVE_ALT_BYTES)	; \
+	NOSVS_LEAVE_ALTSTACK
+
+#else
+#define SVS_ENTER	/* nothing */
+#define SVS_LEAVE	/* nothing */
+#define SVS_ENTER_ALTSTACK	/* nothing */
+#define SVS_LEAVE_ALTSTACK	/* nothing */
+#endif
+
+#define	INTRENTRY \
 	subq	$TF_REGSIZE,%rsp	; \
 	INTR_SAVE_GPRS			; \
 	cld				; \
 	testb	$SEL_UPL,TF_CS(%rsp)	; \
-	je	kernel_trap		; \
-usertrap				; \
+	je	98f			; \
 	SWAPGS				; \
+	SVS_ENTER			; \
 	movw	%gs,TF_GS(%rsp)		; \
 	movw	%fs,TF_FS(%rsp)		; \
 	movw	%es,TF_ES(%rsp)		; \
-	movw	%ds,TF_DS(%rsp)	
-
-#define	INTRENTRY \
-	INTRENTRY_L(98f,)		; \
+	movw	%ds,TF_DS(%rsp)		; \
 98:
 
 #define INTRFASTEXIT \

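As an aside on the HOTPATCH() mechanism introduced in frameasm.h above: each
use of the macro emits a small record into the .rodata.hotpatch section,
consisting of a one-byte name (one of the HP_NAME_SVS_* constants), a
one-byte size, and the 64-bit address of the placeholder code at the patch
site. A minimal C sketch of that record layout, as a consumer walking the
section might declare it (the struct name and field names are illustrative
assumptions, not identifiers from this pullup):

	#include <sys/types.h>

	/* One entry per HOTPATCH() site; matches the .byte/.byte/.quad
	 * sequence emitted by the macro, so the layout must be packed. */
	struct hotpatch {
		uint8_t  name;	/* HP_NAME_SVS_ENTER, HP_NAME_SVS_LEAVE, ... */
		uint8_t  size;	/* bytes of NOSVS_* placeholder at addr */
		uint64_t addr;	/* address of the placeholder instructions */
	} __packed;		/* 10 bytes per record */

The svs_enter/svs_leave code sequences exported from locore.S (delimited by
the matching *_end labels) are presumably what gets copied over the NOSVS_*
placeholders when SVS is enabled; the placeholder sizes in frameasm.h
(SVS_ENTER_BYTES and friends) correspond to those sequences.
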
Index: src/sys/arch/amd64/include/param.h
diff -u src/sys/arch/amd64/include/param.h:1.21.6.1 src/sys/arch/amd64/include/param.h:1.21.6.2
--- src/sys/arch/amd64/include/param.h:1.21.6.1	Fri Mar 16 13:17:56 2018
+++ src/sys/arch/amd64/include/param.h	Thu Mar 22 16:59:03 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: param.h,v 1.21.6.1 2018/03/16 13:17:56 martin Exp $	*/
+/*	$NetBSD: param.h,v 1.21.6.2 2018/03/22 16:59:03 martin Exp $	*/
 
 #ifdef __x86_64__
 
@@ -53,9 +53,9 @@
 #define	SSIZE		1		/* initial stack size/NBPG */
 #define	SINCR		1		/* increment of stack/NBPG */
 #ifdef DIAGNOSTIC
-#define	UPAGES		4		/* pages of u-area (1 for redzone) */
+#define	UPAGES		5		/* pages of u-area (1 for redzone) */
 #else
-#define	UPAGES		3		/* pages of u-area */
+#define	UPAGES		4		/* pages of u-area */
 #endif
 #define	USPACE		(UPAGES * NBPG)	/* total size of u-area */
 

Index: src/sys/arch/amd64/include/pmap.h
diff -u src/sys/arch/amd64/include/pmap.h:1.39 src/sys/arch/amd64/include/pmap.h:1.39.8.1
--- src/sys/arch/amd64/include/pmap.h:1.39	Fri Nov 11 12:06:31 2016
+++ src/sys/arch/amd64/include/pmap.h	Thu Mar 22 16:59:03 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: pmap.h,v 1.39 2016/11/11 12:06:31 maxv Exp $	*/
+/*	$NetBSD: pmap.h,v 1.39.8.1 2018/03/22 16:59:03 martin Exp $	*/
 
 /*
  * Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -218,6 +218,12 @@
  */
 #define NPTECL		8
 
+void svs_pmap_sync(struct pmap *, int);
+void svs_lwp_switch(struct lwp *, struct lwp *);
+void svs_pdir_switch(struct pmap *);
+void svs_init(void);
+extern bool svs_enabled;
+
 #include <x86/pmap.h>
 
 #ifndef XEN

Index: src/sys/arch/x86/conf/files.x86
diff -u src/sys/arch/x86/conf/files.x86:1.88 src/sys/arch/x86/conf/files.x86:1.88.6.1
--- src/sys/arch/x86/conf/files.x86:1.88	Fri Mar 10 14:40:56 2017
+++ src/sys/arch/x86/conf/files.x86	Thu Mar 22 16:59:03 2018
@@ -1,4 +1,4 @@
-#	$NetBSD: files.x86,v 1.88 2017/03/10 14:40:56 maxv Exp $
+#	$NetBSD: files.x86,v 1.88.6.1 2018/03/22 16:59:03 martin Exp $
 
 # options for MP configuration through the MP spec
 defflag opt_mpbios.h MPBIOS MPVERBOSE MPDEBUG MPBIOS_SCANPCI
@@ -16,6 +16,8 @@ defflag opt_pcifixup.h	PCI_ADDR_FIXUP PC
 # To be able to test for NetBSD/xen in shared files
 defflag	opt_xen.h		DO_NOT_DEFINE
 
+defflag	SVS
+
 define  cpubus { [apid = -1] }
 define	cpufeaturebus {}
 define  ioapicbus { [apid = -1] }
@@ -90,6 +92,7 @@ file 	arch/x86/x86/pmap.c		machdep
 file 	arch/x86/x86/pmap_tlb.c		machdep
 file 	arch/x86/x86/pmc.c		machdep
 file	arch/x86/x86/procfs_machdep.c	procfs
+file 	arch/x86/x86/svs.c		machdep & svs
 file	arch/x86/x86/sys_machdep.c	machdep
 file	arch/x86/x86/syscall.c		machdep
 file	arch/x86/x86/tsc.c		machdep

Index: src/sys/arch/x86/include/cpu.h
diff -u src/sys/arch/x86/include/cpu.h:1.71.2.3 src/sys/arch/x86/include/cpu.h:1.71.2.4
--- src/sys/arch/x86/include/cpu.h:1.71.2.3	Fri Mar 16 13:17:56 2018
+++ src/sys/arch/x86/include/cpu.h	Thu Mar 22 16:59:04 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: cpu.h,v 1.71.2.3 2018/03/16 13:17:56 martin Exp $	*/
+/*	$NetBSD: cpu.h,v 1.71.2.4 2018/03/22 16:59:04 martin Exp $	*/
 
 /*-
  * Copyright (c) 1990 The Regents of the University of California.
@@ -47,6 +47,7 @@
 #if defined(_KERNEL) || defined(_KMEMUSER)
 #if defined(_KERNEL_OPT)
 #include "opt_xen.h"
+#include "opt_svs.h"
 #ifdef i386
 #include "opt_user_ldt.h"
 #include "opt_vm86.h"
@@ -197,6 +198,18 @@ struct cpu_info {
 	pd_entry_t *	ci_pae_l3_pdir; /* VA pointer to L3 PD */
 #endif
 
+#ifdef SVS
+	pd_entry_t *	ci_svs_updir;
+	paddr_t		ci_svs_updirpa;
+	paddr_t		ci_svs_kpdirpa;
+	kmutex_t	ci_svs_mtx;
+	pd_entry_t *	ci_svs_rsp0_pte;
+	vaddr_t		ci_svs_rsp0;
+	vaddr_t		ci_svs_ursp0;
+	vaddr_t		ci_svs_krsp0;
+	vaddr_t		ci_svs_utls;
+#endif
+
 #if defined(XEN) && (defined(PAE) || defined(__x86_64__))
 	/* Currently active user PGD (can't use rcr3() with Xen) */
 	pd_entry_t *	ci_kpm_pdir;	/* per-cpu PMD (va) */
@@ -342,6 +355,7 @@ void cpu_broadcast_halt(void);
 void cpu_kick(struct cpu_info *);
 
 void cpu_pcpuarea_init(struct cpu_info *);
+void cpu_svs_init(struct cpu_info *);
 
 #define	curcpu()		x86_curcpu()
 #define	curlwp			x86_curlwp()

Index: src/sys/arch/x86/include/pmap.h
diff -u src/sys/arch/x86/include/pmap.h:1.64.6.1 src/sys/arch/x86/include/pmap.h:1.64.6.2
--- src/sys/arch/x86/include/pmap.h:1.64.6.1	Fri Mar 16 13:17:56 2018
+++ src/sys/arch/x86/include/pmap.h	Thu Mar 22 16:59:04 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: pmap.h,v 1.64.6.1 2018/03/16 13:17:56 martin Exp $	*/
+/*	$NetBSD: pmap.h,v 1.64.6.2 2018/03/22 16:59:04 martin Exp $	*/
 
 /*
  * Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -126,9 +126,13 @@ struct pcpu_entry {
 	uint8_t ist0[PAGE_SIZE];
 	uint8_t ist1[PAGE_SIZE];
 	uint8_t ist2[PAGE_SIZE];
+	uint8_t rsp0[2 * PAGE_SIZE];
 } __packed;
 
 struct pcpu_area {
+#ifdef SVS
+	uint8_t utls[PAGE_SIZE];
+#endif
 	uint8_t idt[PAGE_SIZE];
 	uint8_t ldt[PAGE_SIZE];
 	struct pcpu_entry ent[MAXCPUS];

Index: src/sys/arch/x86/x86/cpu.c
diff -u src/sys/arch/x86/x86/cpu.c:1.130.2.4 src/sys/arch/x86/x86/cpu.c:1.130.2.5
--- src/sys/arch/x86/x86/cpu.c:1.130.2.4	Fri Mar 16 13:17:56 2018
+++ src/sys/arch/x86/x86/cpu.c	Thu Mar 22 16:59:04 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: cpu.c,v 1.130.2.4 2018/03/16 13:17:56 martin Exp $	*/
+/*	$NetBSD: cpu.c,v 1.130.2.5 2018/03/22 16:59:04 martin Exp $	*/
 
 /*-
  * Copyright (c) 2000-2012 NetBSD Foundation, Inc.
@@ -62,12 +62,13 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.130.2.4 2018/03/16 13:17:56 martin Exp $");
+__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.130.2.5 2018/03/22 16:59:04 martin Exp $");
 
 #include "opt_ddb.h"
 #include "opt_mpbios.h"		/* for MPDEBUG */
 #include "opt_mtrr.h"
 #include "opt_multiprocessor.h"
+#include "opt_svs.h"
 
 #include "lapic.h"
 #include "ioapic.h"
@@ -410,6 +411,10 @@ cpu_attach(device_t parent, device_t sel
 		KASSERT(ci->ci_data.cpu_idlelwp != NULL);
 	}
 
+#ifdef SVS
+	cpu_svs_init(ci);
+#endif
+
 	pmap_reference(pmap_kernel());
 	ci->ci_pmap = pmap_kernel();
 	ci->ci_tlbstate = TLBSTATE_STALE;
@@ -597,6 +602,9 @@ cpu_init(struct cpu_info *ci)
 	 * hardware supports it.
 	 */
 	if (cpu_feature[0] & CPUID_PGE)
+#ifdef SVS
+		if (!svs_enabled)
+#endif
 		cr4 |= CR4_PGE;	/* enable global TLB caching */
 
 	/*
@@ -1071,7 +1079,7 @@ mp_cpu_start_cleanup(struct cpu_info *ci
 
 #ifdef __x86_64__
 typedef void (vector)(void);
-extern vector Xsyscall, Xsyscall32;
+extern vector Xsyscall, Xsyscall32, Xsyscall_svs;
 #endif
 
 void
@@ -1085,6 +1093,11 @@ cpu_init_msrs(struct cpu_info *ci, bool 
 	wrmsr(MSR_CSTAR, (uint64_t)Xsyscall32);
 	wrmsr(MSR_SFMASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);
 
+#ifdef SVS
+	if (svs_enabled)
+		wrmsr(MSR_LSTAR, (uint64_t)Xsyscall_svs);
+#endif
+
 	if (full) {
 		wrmsr(MSR_FSBASE, 0);
 		wrmsr(MSR_GSBASE, (uint64_t)ci);
@@ -1245,6 +1258,10 @@ x86_cpu_idle_halt(void)
 void
 cpu_load_pmap(struct pmap *pmap, struct pmap *oldpmap)
 {
+#ifdef SVS
+	svs_pdir_switch(pmap);
+#endif
+
 #ifdef PAE
 	struct cpu_info *ci = curcpu();
 	bool interrupts_enabled;

Index: src/sys/arch/x86/x86/pmap.c
diff -u src/sys/arch/x86/x86/pmap.c:1.245.6.5 src/sys/arch/x86/x86/pmap.c:1.245.6.6
--- src/sys/arch/x86/x86/pmap.c:1.245.6.5	Fri Mar 16 13:17:56 2018
+++ src/sys/arch/x86/x86/pmap.c	Thu Mar 22 16:59:04 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: pmap.c,v 1.245.6.5 2018/03/16 13:17:56 martin Exp $	*/
+/*	$NetBSD: pmap.c,v 1.245.6.6 2018/03/22 16:59:04 martin Exp $	*/
 
 /*-
  * Copyright (c) 2008, 2010, 2016, 2017 The NetBSD Foundation, Inc.
@@ -171,12 +171,13 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.245.6.5 2018/03/16 13:17:56 martin Exp $");
+__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.245.6.6 2018/03/22 16:59:04 martin Exp $");
 
 #include "opt_user_ldt.h"
 #include "opt_lockdebug.h"
 #include "opt_multiprocessor.h"
 #include "opt_xen.h"
+#include "opt_svs.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -2051,31 +2052,30 @@ pmap_free_ptp(struct pmap *pmap, struct 
 	do {
 		index = pl_i(va, level + 1);
 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
-#if defined(XEN)
-#  if defined(__x86_64__)
+
 		/*
-		 * If ptp is a L3 currently mapped in kernel space,
-		 * on any cpu, clear it before freeing
+		 * On Xen-amd64 or SVS, we need to sync the top level page
+		 * directory on each CPU.
 		 */
+#if defined(XEN) && defined(__x86_64__)
 		if (level == PTP_LEVELS - 1) {
-			/*
-			 * Update the per-cpu PD on all cpus the current
-			 * pmap is active on
-			 */
 			xen_kpm_sync(pmap, index);
 		}
-#  endif /*__x86_64__ */
+#elif defined(SVS)
+		if (svs_enabled && level == PTP_LEVELS - 1) {
+			svs_pmap_sync(pmap, index);
+		}
+#endif
+
 		invaladdr = level == 1 ? (vaddr_t)ptes :
 		    (vaddr_t)pdes[level - 2];
 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
 		    opde, TLBSHOOT_FREE_PTP1);
+
+#if defined(XEN)
 		pmap_tlb_shootnow();
-#else	/* XEN */
-		invaladdr = level == 1 ? (vaddr_t)ptes :
-		    (vaddr_t)pdes[level - 2];
-		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
-		    opde, TLBSHOOT_FREE_PTP1);
-#endif	/* XEN */
+#endif
+
 		pmap_freepage(pmap, ptp, level);
 		if (level < PTP_LEVELS - 1) {
 			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
@@ -2157,15 +2157,19 @@ pmap_get_ptp(struct pmap *pmap, vaddr_t 
 		pa = VM_PAGE_TO_PHYS(ptp);
 		pmap_pte_set(&pva[index], (pd_entry_t)
 		    (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V));
+
+		/*
+		 * On Xen-amd64 or SVS, we need to sync the top level page
+		 * directory on each CPU.
+		 */
 #if defined(XEN) && defined(__x86_64__)
 		if (i == PTP_LEVELS) {
-
-			/*
-			 * Update the per-cpu PD on all cpus the current
-			 * pmap is active on
-			 */
 			xen_kpm_sync(pmap, index);
 		}
+#elif defined(SVS)
+		if (svs_enabled && i == PTP_LEVELS) {
+			svs_pmap_sync(pmap, index);
+		}
 #endif
 		pmap_pte_flush();
 		pmap_stats_update(pmap, 1, 0);

Index: src/sys/arch/x86/x86/vm_machdep.c
diff -u src/sys/arch/x86/x86/vm_machdep.c:1.28.6.2 src/sys/arch/x86/x86/vm_machdep.c:1.28.6.3
--- src/sys/arch/x86/x86/vm_machdep.c:1.28.6.2	Sat Mar 17 11:23:18 2018
+++ src/sys/arch/x86/x86/vm_machdep.c	Thu Mar 22 16:59:04 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: vm_machdep.c,v 1.28.6.2 2018/03/17 11:23:18 martin Exp $	*/
+/*	$NetBSD: vm_machdep.c,v 1.28.6.3 2018/03/22 16:59:04 martin Exp $	*/
 
 /*-
  * Copyright (c) 1982, 1986 The Regents of the University of California.
@@ -80,7 +80,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vm_machdep.c,v 1.28.6.2 2018/03/17 11:23:18 martin Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vm_machdep.c,v 1.28.6.3 2018/03/22 16:59:04 martin Exp $");
 
 #include "opt_mtrr.h"
 
@@ -178,9 +178,16 @@ cpu_lwp_fork(struct lwp *l1, struct lwp 
 	 * returns normally.
 	 */
 	uv = uvm_lwp_getuarea(l2);
+	KASSERT(uv % PAGE_SIZE == 0);
 
 #ifdef __x86_64__
-	pcb2->pcb_rsp0 = (uv + USPACE - 16) & ~0xf;
+#ifdef SVS
+	pcb2->pcb_rsp0 = (uv + USPACE - PAGE_SIZE +
+	    sizeof(struct trapframe));
+	KASSERT((pcb2->pcb_rsp0 & 0xF) == 0);
+#else
+	pcb2->pcb_rsp0 = (uv + USPACE - 16);
+#endif
 	tf = (struct trapframe *)pcb2->pcb_rsp0 - 1;
 #else
 	pcb2->pcb_esp0 = (uv + USPACE - 16);

Index: src/sys/arch/x86/x86/x86_machdep.c
diff -u src/sys/arch/x86/x86/x86_machdep.c:1.91.4.1 src/sys/arch/x86/x86/x86_machdep.c:1.91.4.2
--- src/sys/arch/x86/x86/x86_machdep.c:1.91.4.1	Wed Jun 21 17:41:50 2017
+++ src/sys/arch/x86/x86/x86_machdep.c	Thu Mar 22 16:59:04 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: x86_machdep.c,v 1.91.4.1 2017/06/21 17:41:50 snj Exp $	*/
+/*	$NetBSD: x86_machdep.c,v 1.91.4.2 2018/03/22 16:59:04 martin Exp $	*/
 
 /*-
  * Copyright (c) 2002, 2006, 2007 YAMAMOTO Takashi,
@@ -31,11 +31,12 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: x86_machdep.c,v 1.91.4.1 2017/06/21 17:41:50 snj Exp $");
+__KERNEL_RCSID(0, "$NetBSD: x86_machdep.c,v 1.91.4.2 2018/03/22 16:59:04 martin Exp $");
 
 #include "opt_modular.h"
 #include "opt_physmem.h"
 #include "opt_splash.h"
+#include "opt_svs.h"
 
 #include <sys/types.h>
 #include <sys/param.h>
@@ -1179,6 +1180,22 @@ SYSCTL_SETUP(sysctl_machdep_setup, "sysc
 		       NULL, 0, &use_pae, 0,
 		       CTL_MACHDEP, CTL_CREATE, CTL_EOL);
 
+#ifdef SVS
+	int sysctl_machdep_svs_enabled(SYSCTLFN_ARGS);
+	const struct sysctlnode *svs_rnode = NULL;
+	sysctl_createv(clog, 0, NULL, &svs_rnode,
+		       CTLFLAG_PERMANENT,
+		       CTLTYPE_NODE, "svs", NULL,
+		       NULL, 0, NULL, 0,
+		       CTL_MACHDEP, CTL_CREATE);
+	sysctl_createv(clog, 0, &svs_rnode, &svs_rnode,
+		       CTLFLAG_READWRITE,
+		       CTLTYPE_BOOL, "enabled",
+		       SYSCTL_DESCR("Whether the kernel uses SVS"),
+		       sysctl_machdep_svs_enabled, 0, &svs_enabled, 0,
+		       CTL_CREATE, CTL_EOL);
+#endif
+
 	/* None of these can ever change once the system has booted */
 	const_sysctl(clog, "fpu_present", CTLTYPE_INT, i386_fpu_present,
 	    CPU_FPU_PRESENT);

Index: src/sys/arch/xen/conf/files.compat
diff -u src/sys/arch/xen/conf/files.compat:1.25.8.1 src/sys/arch/xen/conf/files.compat:1.25.8.2
--- src/sys/arch/xen/conf/files.compat:1.25.8.1	Tue Aug  1 23:18:30 2017
+++ src/sys/arch/xen/conf/files.compat	Thu Mar 22 16:59:04 2018
@@ -1,4 +1,4 @@
-#	$NetBSD: files.compat,v 1.25.8.1 2017/08/01 23:18:30 snj Exp $
+#	$NetBSD: files.compat,v 1.25.8.2 2018/03/22 16:59:04 martin Exp $
 #	NetBSD: files.x86,v 1.10 2003/10/08 17:30:00 bouyer Exp 
 
 # options for MP configuration through the MP spec
@@ -29,6 +29,7 @@ defflag	opt_pcifixup.h		XXXOPT_PCIFIXUP
 defflag	opt_vm86.h			XXXVM86
 
 defflag opt_pmc.h			XXXPMC
+defflag opt_svs.h		XXXSVS
 
 # User-settable LDT (used by WINE)
 defflag	opt_user_ldt.h			XXXUSER_LDT

Added files:

Index: src/sys/arch/x86/x86/svs.c
diff -u /dev/null src/sys/arch/x86/x86/svs.c:1.14.2.2
--- /dev/null	Thu Mar 22 16:59:04 2018
+++ src/sys/arch/x86/x86/svs.c	Thu Mar 22 16:59:04 2018
@@ -0,0 +1,753 @@
+/*	$NetBSD: svs.c,v 1.14.2.2 2018/03/22 16:59:04 martin Exp $	*/
+
+/*
+ * Copyright (c) 2018 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Maxime Villard.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: svs.c,v 1.14.2.2 2018/03/22 16:59:04 martin Exp $");
+
+#include "opt_svs.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/cpu.h>
+#include <sys/sysctl.h>
+#include <sys/xcall.h>
+
+#include <x86/cputypes.h>
+#include <machine/cpuvar.h>
+#include <machine/frameasm.h>
+
+#include <uvm/uvm.h>
+#include <uvm/uvm_page.h>
+
+/*
+ * Separate Virtual Space
+ *
+ * A per-cpu L4 page is maintained in ci_svs_updirpa. During each context
+ * switch to a user pmap, the lower half of updirpa is populated with the
+ * entries containing the userland pages.
+ *
+ * ~~~~~~~~~~ The UTLS Page ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * 
+ * We use a special per-cpu page that we call UTLS, for User Thread Local
+ * Storage. Each CPU has one UTLS page. This page has two VAs:
+ *
+ *  o When the user page tables are loaded in CR3, the VA to access this
+ *    page is &pcpuarea->utls, defined as SVS_UTLS in assembly. This VA is
+ *    _constant_ across CPUs, but in the user page tables this VA points to
+ *    the physical page of the UTLS that is _local_ to the CPU.
+ *
+ *  o When the kernel page tables are loaded in CR3, the VA to access this
+ *    page is ci->ci_svs_utls.
+ *
+ * +----------------------------------------------------------------------+
+ * | CPU0 Local Data                                      (Physical Page) |
+ * | +------------------+                                 +-------------+ |
+ * | | User Page Tables | SVS_UTLS ---------------------> | cpu0's UTLS | |
+ * | +------------------+                                 +-------------+ |
+ * +-------------------------------------------------------------^--------+
+ *                                                               |
+ *                                                               +----------+
+ *                                                                          |
+ * +----------------------------------------------------------------------+ |
+ * | CPU1 Local Data                                      (Physical Page) | |
+ * | +------------------+                                 +-------------+ | |
+ * | | User Page Tables | SVS_UTLS ---------------------> | cpu1's UTLS | | |
+ * | +------------------+                                 +-------------+ | |
+ * +-------------------------------------------------------------^--------+ |
+ *                                                               |          |
+ *   +------------------+                 /----------------------+          |
+ *   | Kern Page Tables | ci->ci_svs_utls                                   |
+ *   +------------------+                 \---------------------------------+
+ *
+ * The goal of the UTLS page is to provide an area where we can store whatever
+ * we want, in a way that is accessible both when the Kernel and when the
+ * User page tables are loaded in CR3.
+ *
+ * We store in the UTLS page three 64bit values:
+ *
+ *  o UTLS_KPDIRPA: the value we must put in CR3 in order to load the kernel
+ *    page tables.
+ *
+ *  o UTLS_SCRATCH: a dummy place where we temporarily store a value during
+ *    the syscall entry procedure.
+ *
+ *  o UTLS_RSP0: the value we must put in RSP in order to have a stack where
+ *    we can push the register states. This is used only during the syscall
+ *    entry procedure, because there the CPU does not automatically switch
+ *    RSP (it does not use the TSS.rsp0 mechanism described below).
+ *
+ * ~~~~~~~~~~ The Stack Switching Mechanism Without SVS ~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * The kernel stack is per-lwp (pcb_rsp0). When doing a context switch between
+ * two user LWPs, the kernel updates TSS.rsp0 (which is per-cpu) to point to
+ * the stack of the new LWP. Then the execution continues. At some point, the
+ * user LWP we context-switched to will perform a syscall or will receive an
+ * interrupt. There, the CPU will automatically read TSS.rsp0 and use it as a
+ * stack. The kernel then pushes the register states on this stack, and
+ * executes in kernel mode normally.
+ *
+ * TSS.rsp0 is used by the CPU only during ring3->ring0 transitions. Therefore,
+ * when an interrupt is received while we were in kernel mode, the CPU does not
+ * read TSS.rsp0. Instead, it just uses the current stack.
+ *
+ * ~~~~~~~~~~ The Stack Switching Mechanism With SVS ~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * In the pcpu_area structure, pointed to by the "pcpuarea" variable, each CPU
+ * has a two-page rsp0 entry (pcpuarea->ent[cid].rsp0). These two pages do
+ * _not_ have associated physical addresses. They are only two VAs.
+ *
+ * The first page is unmapped and acts as a redzone. The second page is
+ * dynamically kentered into the highest page of the real per-lwp kernel stack;
+ * but pay close attention, it is kentered _only_ in the user page tables.
+ * That is to say, the VA of this second page is mapped when the user page
+ * tables are loaded, but not mapped when the kernel page tables are loaded.
+ *
+ * During a context switch, svs_lwp_switch() gets called first. This function
+ * does the kenter job described above, not in the kernel page tables (that
+ * are currently loaded), but in the user page tables (that are not loaded).
+ * 
+ *           VIRTUAL ADDRESSES                     PHYSICAL ADDRESSES
+ *
+ * +-----------------------------+
+ * |      KERNEL PAGE TABLES     |
+ * |    +-------------------+    |                +-------------------+
+ * |    | pcb_rsp0 (page 0) | ------------------> | pcb_rsp0 (page 0) |
+ * |    +-------------------+    |                +-------------------+
+ * |    | pcb_rsp0 (page 1) | ------------------> | pcb_rsp0 (page 1) |
+ * |    +-------------------+    |                +-------------------+
+ * |    | pcb_rsp0 (page 2) | ------------------> | pcb_rsp0 (page 2) |
+ * |    +-------------------+    |                +-------------------+
+ * |    | pcb_rsp0 (page 3) | ------------------> | pcb_rsp0 (page 3) |
+ * |    +-------------------+    |            +-> +-------------------+
+ * +-----------------------------+            |
+ *                                            |
+ * +---------------------------------------+  |
+ * |           USER PAGE TABLES            |  |
+ * | +----------------------------------+  |  |
+ * | | pcpuarea->ent[cid].rsp0 (page 0) |  |  |
+ * | +----------------------------------+  |  |
+ * | | pcpuarea->ent[cid].rsp0 (page 1) | ----+
+ * | +----------------------------------+  |
+ * +---------------------------------------+
+ *
+ * After svs_lwp_switch() gets called, we set pcpuarea->ent[cid].rsp0 (page 1)
+ * in TSS.rsp0. Later, when returning to userland on the lwp we context-
+ * switched to, we will load the user page tables and execute in userland
+ * normally.
+ *
+ * Next time an interrupt or syscall is received, the CPU will automatically
+ * use TSS.rsp0 as a stack. Here it is executing with the user page tables
+ * loaded, and therefore TSS.rsp0 is _mapped_.
+ *
+ * As part of the kernel entry procedure, we now switch CR3 to load the kernel
+ * page tables. Here, we are still using the stack pointer we set in TSS.rsp0.
+ *
+ * Remember that only one page of the stack was mapped, and only in the
+ * user page tables. We just switched to the kernel page tables, so we must
+ * update RSP to be the real per-lwp kernel stack (pcb_rsp0). And we do so,
+ * without touching the stack (since it is now unmapped, touching it would
+ * fault).
+ *
+ * After we updated RSP, we can continue execution exactly as in the non-SVS
+ * case. We don't need to copy the values the CPU pushed on TSS.rsp0: even if
+ * we updated RSP to a totally different VA, this VA points to the same
+ * physical page as TSS.rsp0. So in the end, the values the CPU pushed are
+ * still here even with the new RSP.
+ *
+ * Thanks to this double-kenter optimization, we don't need to copy the
+ * trapframe during each user<->kernel transition.
+ *
+ * ~~~~~~~~~~ Notes On Locking And Synchronization ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  o Touching ci_svs_updir without holding ci_svs_mtx first is *not*
+ *    allowed.
+ *
+ *  o pm_kernel_cpus contains the set of CPUs that have the pmap loaded
+ *    in their CR3 register. It must *not* be replaced by pm_cpus.
+ *
+ *  o When a context switch on the current CPU is made from a user LWP
+ *    towards a kernel LWP, CR3 is not updated. Therefore, the pmap's
+ *    pm_kernel_cpus still contains the current CPU. This implies that the
+ *    remote CPUs executing other threads of the user process we just
+ *    left will keep synchronizing us against their changes.
+ *
+ * ~~~~~~~~~~ List Of Areas That Are Removed From Userland ~~~~~~~~~~~~~~~~~~~
+ *
+ *  o PTE Space
+ *  o Direct Map
+ *  o Remote PCPU Areas
+ *  o Kernel Heap
+ *  o Kernel Image
+ *
+ * ~~~~~~~~~~ Todo List ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * Ordered from highest priority to lowest:
+ *
+ *  o The NMI stack is not double-entered. Therefore if we ever receive an NMI
+ *    and leave it, the content of the stack will be visible to userland (via
+ *    Meltdown). Normally we never leave NMIs, unless a privileged user
+ *    launched PMCs. That's unlikely to happen: our PMC support is pretty
+ *    minimal, and it is privileged-only.
+ *
+ *  o Narrow down the entry points: hide the 'jmp handler' instructions. This
+ *    makes sense on GENERIC_KASLR kernels.
+ *
+ *  o Right now there is only one global LDT, and that's not compatible with
+ *    USER_LDT.
+ */
+
+bool svs_enabled __read_mostly = false;
+
+struct svs_utls {
+	paddr_t kpdirpa;
+	uint64_t scratch;
+	vaddr_t rsp0;
+};
+
+static pd_entry_t *
+svs_tree_add(struct cpu_info *ci, vaddr_t va)
+{
+	extern const vaddr_t ptp_masks[];
+	extern const int ptp_shifts[];
+	extern const long nbpd[];
+	pd_entry_t *dstpde;
+	size_t i, pidx, mod;
+	struct vm_page *pg;
+	paddr_t pa;
+
+	dstpde = ci->ci_svs_updir;
+	mod = (size_t)-1;
+
+	for (i = PTP_LEVELS; i > 1; i--) {
+		pidx = pl_i(va % mod, i);
+
+		if (!pmap_valid_entry(dstpde[pidx])) {
+			pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
+			if (pg == 0)
+				panic("%s: failed to allocate PA for CPU %d\n",
+					__func__, cpu_index(ci));
+			pa = VM_PAGE_TO_PHYS(pg);
+
+			dstpde[pidx] = PG_V | PG_RW | pa;
+		}
+
+		pa = (paddr_t)(dstpde[pidx] & PG_FRAME);
+		dstpde = (pd_entry_t *)PMAP_DIRECT_MAP(pa);
+		mod = nbpd[i-1];
+	}
+
+	return dstpde;
+}
+
+static void
+svs_page_add(struct cpu_info *ci, vaddr_t va)
+{
+	pd_entry_t *srcpde, *dstpde, pde;
+	size_t idx, pidx;
+	paddr_t pa;
+
+	/* Create levels L4, L3 and L2. */
+	dstpde = svs_tree_add(ci, va);
+
+	pidx = pl1_i(va % NBPD_L2);
+
+	/*
+	 * If 'va' is in a large page, we need to compute its physical
+	 * address manually.
+	 */
+	idx = pl2_i(va);
+	srcpde = L2_BASE;
+	if (!pmap_valid_entry(srcpde[idx])) {
+		panic("%s: L2 page not mapped", __func__);
+	}
+	if (srcpde[idx] & PG_PS) {
+		pa = srcpde[idx] & PG_2MFRAME;
+		pa += (paddr_t)(va % NBPD_L2);
+		pde = (srcpde[idx] & ~(PG_G|PG_PS|PG_2MFRAME)) | pa;
+
+		if (pmap_valid_entry(dstpde[pidx])) {
+			panic("%s: L1 page already mapped", __func__);
+		}
+		dstpde[pidx] = pde;
+		return;
+	}
+
+	/*
+	 * Normal page, just copy the PDE.
+	 */
+	idx = pl1_i(va);
+	srcpde = L1_BASE;
+	if (!pmap_valid_entry(srcpde[idx])) {
+		panic("%s: L1 page not mapped", __func__);
+	}
+	if (pmap_valid_entry(dstpde[pidx])) {
+		panic("%s: L1 page already mapped", __func__);
+	}
+	dstpde[pidx] = srcpde[idx] & ~(PG_G);
+}
+
+static void
+svs_rsp0_init(struct cpu_info *ci)
+{
+	const cpuid_t cid = cpu_index(ci);
+	vaddr_t va, rsp0;
+	pd_entry_t *pd;
+	size_t pidx;
+
+	rsp0 = (vaddr_t)&pcpuarea->ent[cid].rsp0;
+
+	/* The first page is a redzone. */
+	va = rsp0 + PAGE_SIZE;
+
+	/* Create levels L4, L3 and L2. */
+	pd = svs_tree_add(ci, va);
+
+	/* Get the info for L1. */
+	pidx = pl1_i(va % NBPD_L2);
+	if (pmap_valid_entry(pd[pidx])) {
+		panic("%s: rsp0 page already mapped", __func__);
+	}
+
+	ci->ci_svs_rsp0_pte = (pt_entry_t *)&pd[pidx];
+	ci->ci_svs_rsp0 = rsp0 + PAGE_SIZE + sizeof(struct trapframe);
+	ci->ci_svs_ursp0 = ci->ci_svs_rsp0 - sizeof(struct trapframe);
+	ci->ci_svs_krsp0 = 0;
+}
+
+static void
+svs_utls_init(struct cpu_info *ci)
+{
+	const vaddr_t utlsva = (vaddr_t)&pcpuarea->utls;
+	struct svs_utls *utls;
+	struct vm_page *pg;
+	pd_entry_t *pd;
+	size_t pidx;
+	paddr_t pa;
+	vaddr_t va;
+
+	/* Create levels L4, L3 and L2 of the UTLS page. */
+	pd = svs_tree_add(ci, utlsva);
+
+	/* Allocate L1. */
+	pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
+	if (pg == 0)
+		panic("%s: failed to allocate PA for CPU %d\n", __func__,
+		    cpu_index(ci));
+	pa = VM_PAGE_TO_PHYS(pg);
+
+	/* Enter L1. */
+	if (pmap_valid_entry(L1_BASE[pl1_i(utlsva)])) {
+		panic("%s: local page already mapped", __func__);
+	}
+	pidx = pl1_i(utlsva % NBPD_L2);
+	if (pmap_valid_entry(pd[pidx])) {
+		panic("%s: L1 page already mapped", __func__);
+	}
+	pd[pidx] = PG_V | PG_RW | pmap_pg_nx | pa;
+
+	/*
+	 * Now, allocate a VA in the kernel map, that points to the UTLS
+	 * page. After that, the UTLS page will be accessible in kernel
+	 * mode via ci_svs_utls.
+	 */
+	va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
+	    UVM_KMF_VAONLY|UVM_KMF_NOWAIT);
+	if (va == 0) {
+		panic("%s: unable to allocate VA\n", __func__);
+	}
+	pmap_kenter_pa(va, pa, VM_PROT_READ|VM_PROT_WRITE, 0);
+	pmap_update(pmap_kernel());
+
+	ci->ci_svs_utls = va;
+
+	/* Initialize the constant fields of the UTLS page */
+	utls = (struct svs_utls *)ci->ci_svs_utls;
+	utls->rsp0 = ci->ci_svs_rsp0;
+}
+
+static void
+svs_range_add(struct cpu_info *ci, vaddr_t va, size_t size)
+{
+	size_t i, n;
+
+	KASSERT(size % PAGE_SIZE == 0);
+	n = size / PAGE_SIZE;
+	for (i = 0; i < n; i++) {
+		svs_page_add(ci, va + i * PAGE_SIZE);
+	}
+}
+
+void
+cpu_svs_init(struct cpu_info *ci)
+{
+	extern char __text_user_start;
+	extern char __text_user_end;
+	const cpuid_t cid = cpu_index(ci);
+	struct vm_page *pg;
+
+	KASSERT(ci != NULL);
+
+	pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
+	if (pg == 0)
+		panic("%s: failed to allocate L4 PA for CPU %d\n",
+			__func__, cpu_index(ci));
+	ci->ci_svs_updirpa = VM_PAGE_TO_PHYS(pg);
+
+	ci->ci_svs_updir = (pt_entry_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
+		UVM_KMF_VAONLY | UVM_KMF_NOWAIT);
+	if (ci->ci_svs_updir == NULL)
+		panic("%s: failed to allocate L4 VA for CPU %d\n",
+			__func__, cpu_index(ci));
+
+	pmap_kenter_pa((vaddr_t)ci->ci_svs_updir, ci->ci_svs_updirpa,
+		VM_PROT_READ | VM_PROT_WRITE, 0);
+
+	pmap_update(pmap_kernel());
+
+	ci->ci_svs_kpdirpa = pmap_pdirpa(pmap_kernel(), 0);
+
+	mutex_init(&ci->ci_svs_mtx, MUTEX_DEFAULT, IPL_VM);
+
+	svs_page_add(ci, (vaddr_t)&pcpuarea->idt);
+	svs_page_add(ci, (vaddr_t)&pcpuarea->ldt);
+	svs_range_add(ci, (vaddr_t)&pcpuarea->ent[cid],
+	    offsetof(struct pcpu_entry, rsp0));
+	svs_range_add(ci, (vaddr_t)&__text_user_start,
+	    (vaddr_t)&__text_user_end - (vaddr_t)&__text_user_start);
+
+	svs_rsp0_init(ci);
+	svs_utls_init(ci);
+}
+
+void
+svs_pmap_sync(struct pmap *pmap, int index)
+{
+	CPU_INFO_ITERATOR cii;
+	struct cpu_info *ci;
+	cpuid_t cid;
+
+	KASSERT(pmap != NULL);
+	KASSERT(pmap != pmap_kernel());
+	KASSERT(mutex_owned(pmap->pm_lock));
+	KASSERT(kpreempt_disabled());
+	KASSERT(index < 255);
+
+	for (CPU_INFO_FOREACH(cii, ci)) {
+		cid = cpu_index(ci);
+
+		if (!kcpuset_isset(pmap->pm_kernel_cpus, cid)) {
+			continue;
+		}
+
+		/* take the lock and check again */
+		mutex_enter(&ci->ci_svs_mtx);
+		if (kcpuset_isset(pmap->pm_kernel_cpus, cid)) {
+			ci->ci_svs_updir[index] = pmap->pm_pdir[index];
+		}
+		mutex_exit(&ci->ci_svs_mtx);
+	}
+}
+
+void
+svs_lwp_switch(struct lwp *oldlwp, struct lwp *newlwp)
+{
+	struct cpu_info *ci = curcpu();
+	struct svs_utls *utls;
+	struct pcb *pcb;
+	pt_entry_t *pte;
+	uintptr_t rsp0;
+	vaddr_t va;
+
+	if (newlwp->l_flag & LW_SYSTEM) {
+		return;
+	}
+
+#ifdef DIAGNOSTIC
+	if (oldlwp != NULL && !(oldlwp->l_flag & LW_SYSTEM)) {
+		pcb = lwp_getpcb(oldlwp);
+		rsp0 = pcb->pcb_rsp0;
+		va = rounddown(rsp0, PAGE_SIZE);
+		KASSERT(ci->ci_svs_krsp0 == rsp0 - sizeof(struct trapframe));
+		pte = ci->ci_svs_rsp0_pte;
+		KASSERT(*pte == L1_BASE[pl1_i(va)]);
+	}
+#endif
+
+	pcb = lwp_getpcb(newlwp);
+	rsp0 = pcb->pcb_rsp0;
+	va = rounddown(rsp0, PAGE_SIZE);
+
+	/* Update the kernel rsp0 in cpu_info */
+	ci->ci_svs_krsp0 = rsp0 - sizeof(struct trapframe);
+	KASSERT((ci->ci_svs_krsp0 % PAGE_SIZE) ==
+	    (ci->ci_svs_ursp0 % PAGE_SIZE));
+
+	utls = (struct svs_utls *)ci->ci_svs_utls;
+	utls->scratch = 0;
+
+	/*
+	 * Enter the user rsp0. We don't need to flush the TLB here, since
+	 * the user page tables are not loaded.
+	 */
+	pte = ci->ci_svs_rsp0_pte;
+	*pte = L1_BASE[pl1_i(va)];
+}
+
+static inline pt_entry_t
+svs_pte_atomic_read(struct pmap *pmap, size_t idx)
+{
+	/*
+	 * XXX: We don't have a basic atomic_fetch_64 function?
+	 */
+	return atomic_cas_64(&pmap->pm_pdir[idx], 666, 666);
+}
+
+/*
+ * We may come here with the pmap unlocked. So read its PTEs atomically. If
+ * a remote CPU is updating them at the same time, it's not a problem: the
+ * remote CPU will call svs_pmap_sync afterwards, and our updirpa will be
+ * synchronized properly.
+ */
+void
+svs_pdir_switch(struct pmap *pmap)
+{
+	struct cpu_info *ci = curcpu();
+	struct svs_utls *utls;
+	pt_entry_t pte;
+	size_t i;
+
+	KASSERT(kpreempt_disabled());
+	KASSERT(pmap != pmap_kernel());
+
+	ci->ci_svs_kpdirpa = pmap_pdirpa(pmap, 0);
+
+	/* Update the info in the UTLS page */
+	utls = (struct svs_utls *)ci->ci_svs_utls;
+	utls->kpdirpa = ci->ci_svs_kpdirpa;
+
+	mutex_enter(&ci->ci_svs_mtx);
+
+	/* User slots. */
+	for (i = 0; i < 255; i++) {
+		pte = svs_pte_atomic_read(pmap, i);
+		ci->ci_svs_updir[i] = pte;
+	}
+
+	mutex_exit(&ci->ci_svs_mtx);
+}
+
+static void
+svs_enable(void)
+{
+	extern uint8_t svs_enter, svs_enter_end;
+	extern uint8_t svs_enter_altstack, svs_enter_altstack_end;
+	extern uint8_t svs_leave, svs_leave_end;
+	extern uint8_t svs_leave_altstack, svs_leave_altstack_end;
+	u_long psl, cr0;
+	uint8_t *bytes;
+	size_t size;
+
+	svs_enabled = true;
+
+	x86_patch_window_open(&psl, &cr0);
+
+	bytes = &svs_enter;
+	size = (size_t)&svs_enter_end - (size_t)&svs_enter;
+	x86_hotpatch(HP_NAME_SVS_ENTER, bytes, size);
+
+	bytes = &svs_enter_altstack;
+	size = (size_t)&svs_enter_altstack_end -
+	    (size_t)&svs_enter_altstack;
+	x86_hotpatch(HP_NAME_SVS_ENTER_ALT, bytes, size);
+
+	bytes = &svs_leave;
+	size = (size_t)&svs_leave_end - (size_t)&svs_leave;
+	x86_hotpatch(HP_NAME_SVS_LEAVE, bytes, size);
+
+	bytes = &svs_leave_altstack;
+	size = (size_t)&svs_leave_altstack_end -
+	    (size_t)&svs_leave_altstack;
+	x86_hotpatch(HP_NAME_SVS_LEAVE_ALT, bytes, size);
+
+	x86_patch_window_close(psl, cr0);
+}
+
+static void
+svs_disable_hotpatch(void)
+{
+	extern uint8_t nosvs_enter, nosvs_enter_end;
+	extern uint8_t nosvs_enter_altstack, nosvs_enter_altstack_end;
+	extern uint8_t nosvs_leave, nosvs_leave_end;
+	extern uint8_t nosvs_leave_altstack, nosvs_leave_altstack_end;
+	u_long psl, cr0;
+	uint8_t *bytes;
+	size_t size;
+
+	x86_patch_window_open(&psl, &cr0);
+
+	bytes = &nosvs_enter;
+	size = (size_t)&nosvs_enter_end - (size_t)&nosvs_enter;
+	x86_hotpatch(HP_NAME_SVS_ENTER, bytes, size);
+
+	bytes = &nosvs_enter_altstack;
+	size = (size_t)&nosvs_enter_altstack_end -
+	    (size_t)&nosvs_enter_altstack;
+	x86_hotpatch(HP_NAME_SVS_ENTER_ALT, bytes, size);
+
+	bytes = &nosvs_leave;
+	size = (size_t)&nosvs_leave_end - (size_t)&nosvs_leave;
+	x86_hotpatch(HP_NAME_SVS_LEAVE, bytes, size);
+
+	bytes = &nosvs_leave_altstack;
+	size = (size_t)&nosvs_leave_altstack_end -
+	    (size_t)&nosvs_leave_altstack;
+	x86_hotpatch(HP_NAME_SVS_LEAVE_ALT, bytes, size);
+
+	x86_patch_window_close(psl, cr0);
+}
+
+static volatile unsigned long svs_cpu_barrier1 __cacheline_aligned;
+static volatile unsigned long svs_cpu_barrier2 __cacheline_aligned;
+typedef void (vector)(void);
+
+static void
+svs_disable_cpu(void *arg1, void *arg2)
+{
+	struct cpu_info *ci = curcpu();
+	extern vector Xsyscall;
+	u_long psl;
+
+	psl = x86_read_psl();
+	x86_disable_intr();
+
+	atomic_dec_ulong(&svs_cpu_barrier1);
+	while (atomic_cas_ulong(&svs_cpu_barrier1, 0, 0) != 0) {
+		x86_pause();
+	}
+
+	/* cpu0 is the one that does the hotpatch job */
+	if (ci == &cpu_info_primary) {
+		svs_enabled = false;
+		svs_disable_hotpatch();
+	}
+
+	/* put back the non-SVS syscall entry point */
+	wrmsr(MSR_LSTAR, (uint64_t)Xsyscall);
+
+	/* enable global pages */
+	if (cpu_feature[0] & CPUID_PGE)
+		lcr4(rcr4() | CR4_PGE);
+
+	atomic_dec_ulong(&svs_cpu_barrier2);
+	while (atomic_cas_ulong(&svs_cpu_barrier2, 0, 0) != 0) {
+		x86_pause();
+	}
+
+	/* Write back and invalidate cache, flush pipelines. */
+	wbinvd();
+	x86_flush();
+
+	x86_write_psl(psl);
+}
+
+static int
+svs_disable(void)
+{
+	struct cpu_info *ci = NULL;
+	CPU_INFO_ITERATOR cii;
+	uint64_t xc;
+
+	mutex_enter(&cpu_lock);
+
+	/*
+	 * We expect all the CPUs to be online.
+	 */
+	for (CPU_INFO_FOREACH(cii, ci)) {
+		struct schedstate_percpu *spc = &ci->ci_schedstate;
+		if (spc->spc_flags & SPCF_OFFLINE) {
+			printf("[!] cpu%d offline, SVS not disabled\n",
+			    cpu_index(ci));
+			mutex_exit(&cpu_lock);
+			return EOPNOTSUPP;
+		}
+	}
+
+	svs_cpu_barrier1 = ncpu;
+	svs_cpu_barrier2 = ncpu;
+
+	printf("[+] Disabling SVS...");
+	xc = xc_broadcast(0, svs_disable_cpu, NULL, NULL);
+	xc_wait(xc);
+	printf(" done!\n");
+
+	mutex_exit(&cpu_lock);
+
+	return 0;
+}
+
+int sysctl_machdep_svs_enabled(SYSCTLFN_ARGS);
+
+int
+sysctl_machdep_svs_enabled(SYSCTLFN_ARGS)
+{
+	struct sysctlnode node;
+	int error, val;
+
+	val = *(int *)rnode->sysctl_data;
+
+	node = *rnode;
+	node.sysctl_data = &val;
+
+	error = sysctl_lookup(SYSCTLFN_CALL(&node));
+	if (error != 0 || newp == NULL)
+		return error;
+
+	if (val == 1) {
+		error = EINVAL;
+	} else {
+		if (svs_enabled)
+			error = svs_disable();
+		else
+			error = 0;
+	}
+
+	return error;
+}
+
+void
+svs_init(void)
+{
+	if (cpu_vendor != CPUVENDOR_INTEL) {
+		return;
+	}
+	svs_enable();
+}

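To eyeball the UTLS bookkeeping described in the big comment at the top of
svs.c, a helper along these lines could be dropped into the file while
bringing the code up. It is a sketch only, not part of the pullup: the
function name is invented, and it uses nothing beyond the svs_utls layout and
cpu_info fields introduced above.

/*
 * Debug sketch: print the kernel-VA view of a CPU's UTLS page.
 * kpdirpa is the CR3 value reloaded when entering the kernel (kept up
 * to date by svs_pdir_switch()), rsp0 mirrors ci_svs_rsp0 (set once in
 * svs_utls_init()), and scratch is transient (cleared in
 * svs_lwp_switch()).
 */
static void __unused
svs_utls_dump(struct cpu_info *ci)
{
	const struct svs_utls *utls = (struct svs_utls *)ci->ci_svs_utls;

	printf("cpu%d: utls kpdirpa=%#llx rsp0=%#llx scratch=%#llx\n",
	    (int)cpu_index(ci),
	    (unsigned long long)utls->kpdirpa,
	    (unsigned long long)utls->rsp0,
	    (unsigned long long)utls->scratch);
	KASSERT(utls->rsp0 == ci->ci_svs_rsp0);
}
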