Module Name:	src
Committed By:	martin
Date:		Thu Mar 22 16:59:04 UTC 2018
Modified Files:
	src/sys/arch/amd64/amd64 [netbsd-8]: amd64_trap.S db_machdep.c
	    genassym.cf locore.S machdep.c trap.c vector.S
	src/sys/arch/amd64/conf [netbsd-8]: GENERIC kern.ldscript
	src/sys/arch/amd64/include [netbsd-8]: frameasm.h param.h pmap.h
	src/sys/arch/x86/conf [netbsd-8]: files.x86
	src/sys/arch/x86/include [netbsd-8]: cpu.h pmap.h
	src/sys/arch/x86/x86 [netbsd-8]: cpu.c pmap.c vm_machdep.c
	    x86_machdep.c
	src/sys/arch/xen/conf [netbsd-8]: files.compat
Added Files:
	src/sys/arch/x86/x86 [netbsd-8]: svs.c

Log Message:
Pull up the following revisions, requested by maxv in ticket #652:

	sys/arch/amd64/amd64/amd64_trap.S	up to 1.39 (partial, patch)
	sys/arch/amd64/amd64/db_machdep.c	1.6 (patch)
	sys/arch/amd64/amd64/genassym.cf	1.65,1.66,1.67 (patch)
	sys/arch/amd64/amd64/locore.S		up to 1.159 (partial, patch)
	sys/arch/amd64/amd64/machdep.c		1.299-1.302 (patch)
	sys/arch/amd64/amd64/trap.c		up to 1.113 (partial, patch)
	sys/arch/amd64/amd64/vector.S		up to 1.61 (partial, patch)
	sys/arch/amd64/conf/GENERIC		1.477,1.478 (patch)
	sys/arch/amd64/conf/kern.ldscript	1.26 (patch)
	sys/arch/amd64/include/frameasm.h	up to 1.37 (partial, patch)
	sys/arch/amd64/include/param.h		1.25 (patch)
	sys/arch/amd64/include/pmap.h		1.41,1.43,1.44 (patch)
	sys/arch/x86/conf/files.x86		1.91,1.93 (patch)
	sys/arch/x86/include/cpu.h		1.88,1.89 (patch)
	sys/arch/x86/include/pmap.h		1.75 (patch)
	sys/arch/x86/x86/cpu.c			1.144,1.146,1.148,1.149 (patch)
	sys/arch/x86/x86/pmap.c			up to 1.289 (partial, patch)
	sys/arch/x86/x86/vm_machdep.c		1.31,1.32 (patch)
	sys/arch/x86/x86/x86_machdep.c		1.104,1.106,1.108 (patch)
	sys/arch/x86/x86/svs.c			1.1-1.14
	sys/arch/xen/conf/files.compat		1.30 (patch)

Backport SVS. Not enabled yet.

To generate a diff of this commit:
cvs rdiff -u -r1.5.6.1 -r1.5.6.2 src/sys/arch/amd64/amd64/amd64_trap.S
cvs rdiff -u -r1.4 -r1.4.30.1 src/sys/arch/amd64/amd64/db_machdep.c
cvs rdiff -u -r1.60.10.1 -r1.60.10.2 src/sys/arch/amd64/amd64/genassym.cf
cvs rdiff -u -r1.123.6.4 -r1.123.6.5 src/sys/arch/amd64/amd64/locore.S
cvs rdiff -u -r1.255.6.5 -r1.255.6.6 src/sys/arch/amd64/amd64/machdep.c
cvs rdiff -u -r1.96.4.1 -r1.96.4.2 src/sys/arch/amd64/amd64/trap.c
cvs rdiff -u -r1.49.2.1 -r1.49.2.2 src/sys/arch/amd64/amd64/vector.S
cvs rdiff -u -r1.459.2.5 -r1.459.2.6 src/sys/arch/amd64/conf/GENERIC
cvs rdiff -u -r1.22.6.2 -r1.22.6.3 src/sys/arch/amd64/conf/kern.ldscript
cvs rdiff -u -r1.20.32.1 -r1.20.32.2 src/sys/arch/amd64/include/frameasm.h
cvs rdiff -u -r1.21.6.1 -r1.21.6.2 src/sys/arch/amd64/include/param.h
cvs rdiff -u -r1.39 -r1.39.8.1 src/sys/arch/amd64/include/pmap.h
cvs rdiff -u -r1.88 -r1.88.6.1 src/sys/arch/x86/conf/files.x86
cvs rdiff -u -r1.71.2.3 -r1.71.2.4 src/sys/arch/x86/include/cpu.h
cvs rdiff -u -r1.64.6.1 -r1.64.6.2 src/sys/arch/x86/include/pmap.h
cvs rdiff -u -r1.130.2.4 -r1.130.2.5 src/sys/arch/x86/x86/cpu.c
cvs rdiff -u -r1.245.6.5 -r1.245.6.6 src/sys/arch/x86/x86/pmap.c
cvs rdiff -u -r0 -r1.14.2.2 src/sys/arch/x86/x86/svs.c
cvs rdiff -u -r1.28.6.2 -r1.28.6.3 src/sys/arch/x86/x86/vm_machdep.c
cvs rdiff -u -r1.91.4.1 -r1.91.4.2 src/sys/arch/x86/x86/x86_machdep.c
cvs rdiff -u -r1.25.8.1 -r1.25.8.2 src/sys/arch/xen/conf/files.compat

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
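[Reader aid, not part of the commit: the diff below introduces a HOTPATCH()
macro in frameasm.h which emits one descriptor per patch site into the
.rodata.hotpatch section (.byte name ; .byte size ; .quad 123b). A minimal
C view of such a record, assuming only that layout -- the struct and field
names are hypothetical:]

	#include <stdint.h>

	/*
	 * Hypothetical C-side view of one .rodata.hotpatch record, as
	 * emitted by HOTPATCH(name, size) in frameasm.h below.
	 */
	struct hotpatch_desc {
		uint8_t  hp_name;	/* HP_NAME_SVS_ENTER, HP_NAME_SVS_LEAVE, ... */
		uint8_t  hp_size;	/* bytes at the site, e.g. SVS_ENTER_BYTES (22) */
		uint64_t hp_addr;	/* address of the placeholder code (label 123b) */
	} __attribute__((__packed__));

[Until SVS is enabled, each patch site holds a NOSVS_* placeholder: a
two-byte short jump over (size - 2) bytes of 0xCC filler, i.e. a no-op.
Enabling SVS then presumably walks these records and copies the code
between the svs_enter/svs_enter_end (etc.) labels in locore.S over the
placeholders, which is consistent with "Not enabled yet" above.]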
Modified files: Index: src/sys/arch/amd64/amd64/amd64_trap.S diff -u src/sys/arch/amd64/amd64/amd64_trap.S:1.5.6.1 src/sys/arch/amd64/amd64/amd64_trap.S:1.5.6.2 --- src/sys/arch/amd64/amd64/amd64_trap.S:1.5.6.1 Wed Mar 7 14:50:56 2018 +++ src/sys/arch/amd64/amd64/amd64_trap.S Thu Mar 22 16:59:03 2018 @@ -1,4 +1,4 @@ -/* $NetBSD: amd64_trap.S,v 1.5.6.1 2018/03/07 14:50:56 martin Exp $ */ +/* $NetBSD: amd64_trap.S,v 1.5.6.2 2018/03/22 16:59:03 martin Exp $ */ /* * Copyright (c) 1998, 2007, 2008, 2017 The NetBSD Foundation, Inc. @@ -95,13 +95,19 @@ #define PRE_TRAP #endif +#define TRAPENTRY \ + INTRENTRY ; \ + jmp .Lalltraps_noentry + #define TRAP_NJ(a) PRE_TRAP ; pushq $(a) #define ZTRAP_NJ(a) PRE_TRAP ; pushq $0 ; pushq $(a) -#define TRAP(a) TRAP_NJ(a) ; jmp _C_LABEL(alltraps) -#define ZTRAP(a) ZTRAP_NJ(a) ; jmp _C_LABEL(alltraps) +#define TRAP(a) TRAP_NJ(a) ; TRAPENTRY +#define ZTRAP(a) ZTRAP_NJ(a) ; TRAPENTRY .text + TEXT_USER_BEGIN + IDTVEC(trap00) ZTRAP(T_DIVIDE) IDTVEC_END(trap00) @@ -128,6 +134,7 @@ IDTVEC(trap02) ZTRAP_NJ(T_NMI) subq $TF_REGSIZE,%rsp INTR_SAVE_GPRS + SVS_ENTER_ALTSTACK cld movw %gs,TF_GS(%rsp) movw %fs,TF_FS(%rsp) @@ -143,6 +150,7 @@ IDTVEC(trap02) movq %rsp,%rdi incq CPUVAR(NTRAP) call _C_LABEL(nmitrap) + SVS_LEAVE_ALTSTACK swapgs jmp .Lnmileave @@ -150,6 +158,7 @@ IDTVEC(trap02) movq %rsp,%rdi incq CPUVAR(NTRAP) call _C_LABEL(nmitrap) + SVS_LEAVE_ALTSTACK .Lnmileave: INTR_RESTORE_GPRS @@ -221,6 +230,7 @@ IDTVEC(trap08) TRAP_NJ(T_DOUBLEFLT) subq $TF_REGSIZE,%rsp INTR_SAVE_GPRS + SVS_ENTER_ALTSTACK testb $SEL_UPL,TF_CS(%rsp) jz 1f swapgs @@ -235,6 +245,7 @@ IDTVEC(trap08) incq CPUVAR(NTRAP) call _C_LABEL(doubletrap) + SVS_LEAVE_ALTSTACK INTR_RESTORE_GPRS testb $SEL_UPL,TF_CS(%rsp) @@ -260,22 +271,22 @@ IDTVEC_END(trap10) * equivalent of iret, if it does this code would be needed * in order to copy the user segment registers into the fault frame. */ -#define check_swapgs alltraps +#define kernuser_reenter alltraps #endif IDTVEC(trap11) /* #NP() Segment not present */ TRAP_NJ(T_SEGNPFLT) - jmp check_swapgs + jmp kernuser_reenter IDTVEC_END(trap11) IDTVEC(trap12) /* #SS() Stack exception */ TRAP_NJ(T_STKFLT) - jmp check_swapgs + jmp kernuser_reenter IDTVEC_END(trap12) IDTVEC(trap13) /* #GP() General protection */ TRAP_NJ(T_PROTFLT) - jmp check_swapgs + jmp kernuser_reenter IDTVEC_END(trap13) IDTVEC(trap14) @@ -352,68 +363,135 @@ IDTVEC(intrspurious) jmp .Lalltraps_checkusr IDTVEC_END(intrspurious) - +#ifndef kernuser_reenter /* - * trap() calls here when it detects a fault in INTRFASTEXIT (loading the - * segment registers or during the iret itself). The address of the (possibly - * reconstructed) user trap frame is passed as an argument. - * - * Typically the code will have raised a SIGSEGV which will be actioned - * by the code below. + * We need to worry about traps in kernel mode while the kernel %gs isn't + * loaded. When such traps happen, we have CPL=0 and %gs=userland, and we + * must perform an additional swapgs to get %gs=kernel. */ - .type _C_LABEL(trap_return_fault_return), @function -LABEL(trap_return_fault_return) - mov %rdi,%rsp /* frame for user return */ -#ifdef DIAGNOSTIC - /* We can't recover the saved %rbx, so suppress warning */ - movl CPUVAR(ILEVEL),%ebx -#endif - jmp .Lalltraps_checkusr -END(trap_return_fault_return) -#ifndef check_swapgs +#define TF_SMALL(val, reg) (val - TF_REGSIZE)(reg) +#define TF_SMALL_REGPUSHED(val, reg) (val - (TF_REGSIZE - 8))(reg) + /* - * We need to worry about traps in kernel mode while the kernel %gs isn't - * loaded. 
These are either faults on iretq during return to user or loads to - * %gs. + * It is possible that we received a trap in kernel mode, but with the user + * context loaded. There are three cases where this can happen: * - * When such traps happen, we have CPL=0 and %gs=userland, and we must perform - * an additional swapgs to get %gs=kernel. + * o Execution of IRETQ. + * o Reload of ES. + * o Reload of DS. + * + * When this happens, the kernel is re-entered in kernel mode, but the + * previous context is in kernel mode too. + * + * We have two iret frames in the stack. In the first one, we also pushed + * 'trapno' and 'err'. The 'rsp' field points to the outer iret frame: + * + * +---------------------------------------------------+ + * | trapno | err | rip | cs=ring0 | rflags | rsp | ss | + * +-------------------------------------------|-------+ + * | + * +---------------------------------+ + * | + * | +------------------------------------+ + * +--> | rip | cs=ring3 | rflags | rsp | ss | + * +------------------------------------+ + * + * We perform a three-step procedure: + * + * o We update RSP to point to the outer frame. This outer frame is in the + * same stack as the current frame, and likely just after the current + * frame. + * + * o We push, in this outer frame, the 'err' and 'trapno' fields of the + * CURRENT frame. + * + * o We do a normal INTRENTRY. Now that RSP points to the outer frame, + * everything behaves as if we had received a trap from the outer frame, + * that is to say, from userland directly. + * + * Finally, we jump to 'calltrap' and handle the trap smoothly. + * + * Two notes regarding SVS: + * + * o With SVS, we will receive the trap while the user page tables are + * loaded. That's not a problem, we don't touch anything unmapped here. + * + * o With SVS, when the user page tables are loaded, the stack is really + * small, and can contain only one trapframe structure. Therefore, in + * intrfastexit, we must save the GPRs and pop their part of the stack + * right away. If we weren't doing that, and the reload of ES faulted for + * example, then the CPU would try to push an iret frame on the current + * stack (nested), and would double-fault because it touches the redzone + * below the stack (see the documentation in x86/x86/svs.c). By popping + * the GPR part of the stack, we leave enough stack for the CPU to push + * an iret frame, and for us to push one 8-byte register (%rdi) too. */ -NENTRY(check_swapgs) - INTRENTRY_L(3f,1:) -2: + _ALIGN_TEXT +LABEL(kernuser_reenter) + testb $SEL_UPL,TF_SMALL(TF_CS, %rsp) + jz .Lkernelmode + +.Lnormal_entry: + INTRENTRY sti jmp calltrap -3: - /* - * Trap in kernel mode. - */ + +.Lkernelmode: + /* We will clobber %rdi */ + pushq %rdi + /* Case 1: fault on iretq? */ - movq TF_RIP(%rsp),%rax - cmpw $0xcf48,(%rax) /* Faulting instruction is iretq ? */ - jne 5f /* Jump if not */ - movq TF_RSP(%rsp),%rax /* Must read %rsp, may be a pad word */ - testb $SEL_UPL,8(%rax) /* Check %cs of outer iret frame */ - je 2b /* jump if iret was to kernel */ - jmp 1b /* to user - must restore %gs */ + leaq do_iret(%rip),%rdi + cmpq %rdi,TF_SMALL_REGPUSHED(TF_RIP, %rsp) + jne 5f + movq TF_SMALL_REGPUSHED(TF_RSP, %rsp),%rdi /* get %rsp */ + testb $SEL_UPL,8(%rdi) /* check %cs of outer iret frame */ + je .Lnormal_entry /* jump if iret was to kernel */ + jmp .Lkernelmode_but_user /* to user - must restore %gs */ 5: - /* Case 2: move to %gs? 
*/ - movw (%rax),%ax - andb $070,%ah /* mask mod/rm from mod/reg/rm */ - cmpw $0x8e+050*256,%ax /* Any move to %gs (reg 5) */ - jne 2b /* No - normal kernel fault */ - jmp 1b /* Yes - restore %gs */ -END(check_swapgs) + /* Case 2: move to %es? */ + leaq do_mov_es(%rip),%rdi + cmpq %rdi,TF_SMALL_REGPUSHED(TF_RIP, %rsp) + je .Lkernelmode_but_user + + /* Case 3: move to %ds? */ + leaq do_mov_ds(%rip),%rdi + cmpq %rdi,TF_SMALL_REGPUSHED(TF_RIP, %rsp) + je .Lkernelmode_but_user + + /* None of the above cases: normal kernel fault */ + popq %rdi + jmp .Lnormal_entry + +.Lkernelmode_but_user: + /* + * Here we have %rdi pushed on the stack, hence 8+. + */ + movq %rsp,%rdi + movq TF_SMALL_REGPUSHED(TF_RSP, %rsp),%rsp + + /* Push tf_err and tf_trapno */ + pushq 8+8(%rdi) /* 8+8(%rdi) = current TF_ERR */ + pushq 8+0(%rdi) /* 8+0(%rdi) = current TF_TRAPNO */ + + /* Restore %rdi */ + movq (%rdi),%rdi + + jmp .Lnormal_entry +END(kernuser_reenter) #endif + TEXT_USER_END + /* * All traps go through here. Call the generic trap handler, and * check for ASTs afterwards. */ NENTRY(alltraps) INTRENTRY +.Lalltraps_noentry: STI(si) calltrap: Index: src/sys/arch/amd64/amd64/db_machdep.c diff -u src/sys/arch/amd64/amd64/db_machdep.c:1.4 src/sys/arch/amd64/amd64/db_machdep.c:1.4.30.1 --- src/sys/arch/amd64/amd64/db_machdep.c:1.4 Wed Oct 3 17:43:22 2012 +++ src/sys/arch/amd64/amd64/db_machdep.c Thu Mar 22 16:59:03 2018 @@ -1,4 +1,4 @@ -/* $NetBSD: db_machdep.c,v 1.4 2012/10/03 17:43:22 riastradh Exp $ */ +/* $NetBSD: db_machdep.c,v 1.4.30.1 2018/03/22 16:59:03 martin Exp $ */ /* * Mach Operating System @@ -26,7 +26,7 @@ * rights to redistribute these changes. */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: db_machdep.c,v 1.4 2012/10/03 17:43:22 riastradh Exp $"); +__KERNEL_RCSID(0, "$NetBSD: db_machdep.c,v 1.4.30.1 2018/03/22 16:59:03 martin Exp $"); #include <sys/param.h> #include <sys/systm.h> @@ -213,11 +213,13 @@ db_frame_info(long *frame, db_addr_t cal if (!strcmp(name, "trap")) { *is_trap = TRAP; narg = 0; - } else if (!strcmp(name, "syscall")) { + } else if (!strcmp(name, "syscall") || + !strcmp(name, "handle_syscall")) { *is_trap = SYSCALL; narg = 0; } else if (name[0] == 'X') { if (!strncmp(name, "Xintr", 5) || + !strncmp(name, "Xhandle", 7) || !strncmp(name, "Xresume", 7) || !strncmp(name, "Xstray", 6) || !strncmp(name, "Xhold", 5) || Index: src/sys/arch/amd64/amd64/genassym.cf diff -u src/sys/arch/amd64/amd64/genassym.cf:1.60.10.1 src/sys/arch/amd64/amd64/genassym.cf:1.60.10.2 --- src/sys/arch/amd64/amd64/genassym.cf:1.60.10.1 Tue Mar 13 15:47:44 2018 +++ src/sys/arch/amd64/amd64/genassym.cf Thu Mar 22 16:59:03 2018 @@ -1,4 +1,4 @@ -# $NetBSD: genassym.cf,v 1.60.10.1 2018/03/13 15:47:44 martin Exp $ +# $NetBSD: genassym.cf,v 1.60.10.2 2018/03/22 16:59:03 martin Exp $ # # Copyright (c) 1998, 2006, 2007, 2008 The NetBSD Foundation, Inc. 
@@ -236,6 +236,13 @@ define CPU_INFO_CURLDT offsetof(struct define CPU_INFO_IDLELWP offsetof(struct cpu_info, ci_data.cpu_idlelwp) define CPU_INFO_PMAP offsetof(struct cpu_info, ci_pmap) define CPU_INFO_TSS offsetof(struct cpu_info, ci_tss) +ifdef SVS +define CPU_INFO_UPDIRPA offsetof(struct cpu_info, ci_svs_updirpa) +define CPU_INFO_KPDIRPA offsetof(struct cpu_info, ci_svs_kpdirpa) +define CPU_INFO_RSP0 offsetof(struct cpu_info, ci_svs_rsp0) +define CPU_INFO_URSP0 offsetof(struct cpu_info, ci_svs_ursp0) +define CPU_INFO_KRSP0 offsetof(struct cpu_info, ci_svs_krsp0) +endif define CPU_INFO_NSYSCALL offsetof(struct cpu_info, ci_data.cpu_nsyscall) define CPU_INFO_NTRAP offsetof(struct cpu_info, ci_data.cpu_ntrap) define CPU_INFO_NINTR offsetof(struct cpu_info, ci_data.cpu_nintr) Index: src/sys/arch/amd64/amd64/locore.S diff -u src/sys/arch/amd64/amd64/locore.S:1.123.6.4 src/sys/arch/amd64/amd64/locore.S:1.123.6.5 --- src/sys/arch/amd64/amd64/locore.S:1.123.6.4 Tue Mar 13 15:47:44 2018 +++ src/sys/arch/amd64/amd64/locore.S Thu Mar 22 16:59:03 2018 @@ -1,4 +1,4 @@ -/* $NetBSD: locore.S,v 1.123.6.4 2018/03/13 15:47:44 martin Exp $ */ +/* $NetBSD: locore.S,v 1.123.6.5 2018/03/22 16:59:03 martin Exp $ */ /* * Copyright-o-rama! @@ -160,6 +160,7 @@ #include "opt_compat_netbsd32.h" #include "opt_compat_ibcs2.h" #include "opt_xen.h" +#include "opt_svs.h" #include "assym.h" #include "lapic.h" @@ -329,6 +330,9 @@ .globl _C_LABEL(bootinfo) .globl _C_LABEL(biosbasemem) .globl _C_LABEL(biosextmem) + .globl do_mov_es + .globl do_mov_ds + .globl do_iret .type _C_LABEL(tablesize), @object _C_LABEL(tablesize): .long TABLESIZE @@ -1080,6 +1084,16 @@ ENTRY(cpu_switchto) movq %rbp,PCB_RBP(%rax) skip_save: +#ifdef SVS + pushq %rdx + movb _C_LABEL(svs_enabled),%dl + testb %dl,%dl + jz .Lskip_svs + callq _C_LABEL(svs_lwp_switch) +.Lskip_svs: + popq %rdx +#endif + /* Switch to newlwp's stack. */ movq L_PCB(%r12),%r14 movq PCB_RSP(%r14),%rsp @@ -1097,6 +1111,19 @@ skip_save: jnz switch_return /* Switch ring0 stack */ +#ifdef SVS + movb _C_LABEL(svs_enabled),%al + testb %al,%al + jz .Lno_svs_switch + + movq CPUVAR(RSP0),%rax + movq CPUVAR(TSS),%rdi + movq %rax,TSS_RSP0(%rdi) + jmp .Lring0_switched + +.Lno_svs_switch: +#endif + #ifndef XEN movq PCB_RSP0(%r14),%rax movq CPUVAR(TSS),%rdi @@ -1105,6 +1132,7 @@ skip_save: movq %r14,%rdi callq _C_LABEL(x86_64_switch_context); #endif +.Lring0_switched: /* Don't bother with the rest if switching to a system process. */ testl $LW_SYSTEM,L_FLAG(%r12) @@ -1223,74 +1251,12 @@ ENTRY(savectx) ret END(savectx) -IDTVEC(syscall32) - sysret /* go away please */ -IDTVEC_END(syscall32) - /* - * syscall() - * - * syscall insn entry. + * Syscall handler. */ -IDTVEC(syscall) -#ifndef XEN - /* - * The user %rip is in %rcx and the user %rflags in %r11. The kernel %cs - * and %ss are loaded, but nothing else is. - * - * The 'swapgs' instruction gives us access to cpu-specific memory where - * we can save a user register and then read the LWP's kernel stack - * pointer. - * - * This code doesn't seem to set %ds, this may not matter since it is - * ignored in 64bit mode, OTOH the syscall instruction sets %ss and that - * is ignored as well. 
- */ - swapgs - movq %r15,CPUVAR(SCRATCH) - movq CPUVAR(CURLWP),%r15 - movq L_PCB(%r15),%r15 - movq PCB_RSP0(%r15),%r15 /* LWP's kernel stack pointer */ - - /* Make stack look like an 'int nn' frame */ -#define SP(x) (x)-(TF_SS+8)(%r15) - movq $(LSEL(LUDATA_SEL, SEL_UPL)),SP(TF_SS) /* user %ss */ - movq %rsp,SP(TF_RSP) /* user %rsp */ - movq %r11,SP(TF_RFLAGS) /* user %rflags */ - movq $(LSEL(LUCODE_SEL, SEL_UPL)),SP(TF_CS) /* user %cs */ - movq %rcx,SP(TF_RIP) /* user %rip */ - - leaq SP(0),%rsp /* %rsp now valid after frame */ - movq CPUVAR(SCRATCH),%r15 -#undef SP - - movq $2,TF_ERR(%rsp) /* syscall instruction size */ - movq $T_ASTFLT,TF_TRAPNO(%rsp) - - movw %es,TF_ES(%rsp) - sti - INTR_SAVE_GPRS - movw %fs,TF_FS(%rsp) - movw %gs,TF_GS(%rsp) - movw $(LSEL(LUDATA_SEL, SEL_UPL)),TF_DS(%rsp) -#else - /* Xen already switched to kernel stack */ - pushq %rsi +NENTRY(handle_syscall) STI(si) - popq %rsi - addq $0x10,%rsp /* gap to match cs:rip */ - pushq $2 /* error code */ - pushq $T_ASTFLT - subq $TF_REGSIZE,%rsp - INTR_SAVE_GPRS - cld - movw %fs,TF_FS(%rsp) - movw %gs,TF_GS(%rsp) - movw %es,TF_ES(%rsp) - movw $(LSEL(LUDATA_SEL, SEL_UPL)),TF_DS(%rsp) -#endif -do_syscall: movq CPUVAR(CURLWP),%r14 incq CPUVAR(NSYSCALL) /* count it atomically */ movq %rsp,L_MD_REGS(%r14) /* save pointer to frame */ @@ -1315,32 +1281,18 @@ do_syscall: jne spl_error #endif + /* + * Decide if we need to take a slow path. That's the case when we + * want to reload %cs and %ss on a 64bit LWP (MDL_IRET set), or when + * we're returning to a 32bit LWP (MDL_COMPAT32 set). + * + * In either case, we jump into intrfastexit and return to userland + * with the iret instruction. + */ testl $(MDL_IRET|MDL_COMPAT32),L_MD_FLAGS(%r14) - INTR_RESTORE_GPRS - movw TF_ES(%rsp),%es - movw TF_DS(%rsp),%ds - SWAPGS - jnz 2f -#ifndef XEN - movq TF_RIP(%rsp),%rcx /* %rip for sysret */ - movq TF_RFLAGS(%rsp),%r11 /* %flags for sysret */ - movq TF_RSP(%rsp),%rsp - sysretq -#else - addq $TF_RIP,%rsp - pushq $256 /* VGCF_IN_SYSCALL */ - jmp HYPERVISOR_iret -#endif + jnz intrfastexit -/* - * If the syscall might have modified some registers, or we are a 32bit - * process we must return to user with an 'iret' instruction. - * If the iret faults in kernel (assumed due to illegal register values) - * then a SIGSEGV will be signalled. - */ -2: - addq $TF_RIP,%rsp - iretq + jmp syscall_sysret #ifdef DIAGNOSTIC /* Report SPL error */ @@ -1372,7 +1324,7 @@ spl_error: movq %rsp,%rdi call _C_LABEL(trap) jmp .Lsyscall_checkast /* re-check ASTs */ -IDTVEC_END(syscall) +END(handle_syscall) /* * void lwp_trampoline(void); @@ -1392,10 +1344,96 @@ NENTRY(lwp_trampoline) END(lwp_trampoline) /* + * Entry points of the 'syscall' instruction, 64bit and 32bit mode. + */ + +#define SP(x) (x)-(TF_SS+8)(%rax) + +.macro SYSCALL_ENTRY name,is_svs +IDTVEC(\name) +#ifndef XEN + /* + * The user %rip is in %rcx and the user %rflags in %r11. The kernel %cs + * and %ss are loaded, but nothing else is. + * + * The 'swapgs' instruction gives us access to cpu-specific memory where + * we can save a user register and then read the LWP's kernel stack + * pointer. + * + * This code doesn't seem to set %ds, this may not matter since it is + * ignored in 64bit mode, OTOH the syscall instruction sets %ss and that + * is ignored as well. 
+ */ + swapgs + + /* Get the LWP's kernel stack pointer in %rax */ + .if \is_svs + movq %rax,SVS_UTLS+UTLS_SCRATCH + movq SVS_UTLS+UTLS_RSP0,%rax + .else + movq %rax,CPUVAR(SCRATCH) + movq CPUVAR(CURLWP),%rax + movq L_PCB(%rax),%rax + movq PCB_RSP0(%rax),%rax + .endif + + /* Make stack look like an 'int nn' frame */ + movq $(LSEL(LUDATA_SEL, SEL_UPL)),SP(TF_SS) /* user %ss */ + movq %rsp,SP(TF_RSP) /* user %rsp */ + movq %r11,SP(TF_RFLAGS) /* user %rflags */ + movq $(LSEL(LUCODE_SEL, SEL_UPL)),SP(TF_CS) /* user %cs */ + movq %rcx,SP(TF_RIP) /* user %rip */ + leaq SP(0),%rsp /* %rsp now valid after frame */ + + /* Restore %rax */ + .if \is_svs + movq SVS_UTLS+UTLS_SCRATCH,%rax + .else + movq CPUVAR(SCRATCH),%rax + .endif + + movq $2,TF_ERR(%rsp) /* syscall instruction size */ + movq $T_ASTFLT,TF_TRAPNO(%rsp) +#else + /* Xen already switched to kernel stack */ + addq $0x10,%rsp /* gap to match cs:rip */ + pushq $2 /* error code */ + pushq $T_ASTFLT + subq $TF_REGSIZE,%rsp + cld +#endif + INTR_SAVE_GPRS + movw $(LSEL(LUDATA_SEL, SEL_UPL)),TF_DS(%rsp) + movw %es,TF_ES(%rsp) + movw %fs,TF_FS(%rsp) + movw %gs,TF_GS(%rsp) + .if \is_svs + SVS_ENTER + .endif + jmp handle_syscall +IDTVEC_END(\name) +.endm + +SYSCALL_ENTRY syscall,is_svs=0 + + TEXT_USER_BEGIN + +#ifdef SVS +SYSCALL_ENTRY syscall_svs,is_svs=1 +#endif + +IDTVEC(syscall32) + sysret /* go away please */ +IDTVEC_END(syscall32) + + TEXT_USER_END + +/* * osyscall() * * Trap gate entry for int $80 syscall, also used by sigreturn. */ + TEXT_USER_BEGIN IDTVEC(osyscall) #ifdef XEN movq (%rsp),%rcx @@ -1405,9 +1443,37 @@ IDTVEC(osyscall) pushq $2 /* size of instruction for restart */ pushq $T_ASTFLT /* trap # for doing ASTs */ INTRENTRY - STI(si) - jmp do_syscall + jmp handle_syscall IDTVEC_END(osyscall) + TEXT_USER_END + +/* + * Return to userland via 'sysret'. + */ + TEXT_USER_BEGIN + _ALIGN_TEXT +LABEL(syscall_sysret) + SVS_LEAVE + + /* Set default the 64bit values in %ds and %es. */ + movq $GSEL(GUDATA_SEL, SEL_UPL),%rax + movw %ax,%ds + movw %ax,%es + + INTR_RESTORE_GPRS + SWAPGS +#ifndef XEN + movq TF_RIP(%rsp),%rcx /* %rip for sysret */ + movq TF_RFLAGS(%rsp),%r11 /* %flags for sysret */ + movq TF_RSP(%rsp),%rsp + sysretq +#else + addq $TF_RIP,%rsp + pushq $256 /* VGCF_IN_SYSCALL */ + jmp HYPERVISOR_iret +#endif +END(syscall_sysret) + TEXT_USER_END /* * bool sse2_idlezero_page(void *pg) @@ -1451,7 +1517,6 @@ END(sse2_idlezero_page) * * Zero a page without polluting the cache. */ - ENTRY(pagezero) movq $-PAGE_SIZE,%rdx subq %rdx,%rdi @@ -1471,15 +1536,92 @@ ENTRY(pagezero) ret END(pagezero) + TEXT_USER_BEGIN + +/* + * In intrfastexit, we advance %rsp at the beginning. We then access the + * segment registers in the trapframe with TF_BACKW (backwards). See the + * documentation in amd64_trap.S for an explanation. 
+ */ + +#define TF_BACKW(val, reg) (val - (TF_REGSIZE+16))(reg) + _ALIGN_TEXT LABEL(intrfastexit) - INTR_RESTORE_GPRS - testq $SEL_UPL,TF_CS(%rsp) - je 99f NOT_XEN(cli;) - movw TF_ES(%rsp),%es - movw TF_DS(%rsp),%ds + SVS_LEAVE + INTR_RESTORE_GPRS + addq $(TF_REGSIZE+16),%rsp /* iret frame */ + + testb $SEL_UPL,TF_BACKW(TF_CS, %rsp) + jz .Lkexit SWAPGS -99: addq $TF_REGSIZE+16,%rsp +do_mov_es: + movw TF_BACKW(TF_ES, %rsp),%es +do_mov_ds: + movw TF_BACKW(TF_DS, %rsp),%ds + +.Lkexit: +do_iret: iretq END(intrfastexit) + + TEXT_USER_END + +#ifdef SVS + .globl svs_enter, svs_enter_end + .globl svs_enter_altstack, svs_enter_altstack_end + .globl svs_leave, svs_leave_end + .globl svs_leave_altstack, svs_leave_altstack_end + .globl nosvs_enter, nosvs_enter_end + .globl nosvs_enter_altstack, nosvs_enter_altstack_end + .globl nosvs_leave, nosvs_leave_end + .globl nosvs_leave_altstack, nosvs_leave_altstack_end + +LABEL(svs_enter) + movq SVS_UTLS+UTLS_KPDIRPA,%rax + movq %rax,%cr3 + movq CPUVAR(KRSP0),%rsp +LABEL(svs_enter_end) + +LABEL(svs_enter_altstack) + testb $SEL_UPL,TF_CS(%rsp) + jz 1234f + movq SVS_UTLS+UTLS_KPDIRPA,%rax + movq %rax,%cr3 +1234: +LABEL(svs_enter_altstack_end) + +LABEL(svs_leave) + testb $SEL_UPL,TF_CS(%rsp) + jz 1234f + movq CPUVAR(URSP0),%rsp + movq CPUVAR(UPDIRPA),%rax + movq %rax,%cr3 +1234: +LABEL(svs_leave_end) + +LABEL(svs_leave_altstack) + testb $SEL_UPL,TF_CS(%rsp) + jz 1234f + movq CPUVAR(UPDIRPA),%rax + movq %rax,%cr3 +1234: +LABEL(svs_leave_altstack_end) + +LABEL(nosvs_enter) + NOSVS_ENTER +LABEL(nosvs_enter_end) + +LABEL(nosvs_enter_altstack) + NOSVS_ENTER_ALTSTACK +LABEL(nosvs_enter_altstack_end) + +LABEL(nosvs_leave) + NOSVS_LEAVE +LABEL(nosvs_leave_end) + +LABEL(nosvs_leave_altstack) + NOSVS_LEAVE_ALTSTACK +LABEL(nosvs_leave_altstack_end) +#endif Index: src/sys/arch/amd64/amd64/machdep.c diff -u src/sys/arch/amd64/amd64/machdep.c:1.255.6.5 src/sys/arch/amd64/amd64/machdep.c:1.255.6.6 --- src/sys/arch/amd64/amd64/machdep.c:1.255.6.5 Fri Mar 16 13:17:56 2018 +++ src/sys/arch/amd64/amd64/machdep.c Thu Mar 22 16:59:03 2018 @@ -1,4 +1,4 @@ -/* $NetBSD: machdep.c,v 1.255.6.5 2018/03/16 13:17:56 martin Exp $ */ +/* $NetBSD: machdep.c,v 1.255.6.6 2018/03/22 16:59:03 martin Exp $ */ /*- * Copyright (c) 1996, 1997, 1998, 2000, 2006, 2007, 2008, 2011 @@ -111,7 +111,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.255.6.5 2018/03/16 13:17:56 martin Exp $"); +__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.255.6.6 2018/03/22 16:59:03 martin Exp $"); /* #define XENDEBUG_LOW */ @@ -123,6 +123,7 @@ __KERNEL_RCSID(0, "$NetBSD: machdep.c,v #include "opt_mtrr.h" #include "opt_realmem.h" #include "opt_xen.h" +#include "opt_svs.h" #ifndef XEN #include "opt_physmem.h" #endif @@ -1544,6 +1545,9 @@ init_x86_64(paddr_t first_avail) #endif /* XEN */ cpu_probe(&cpu_info_primary); +#ifdef SVS + svs_init(); +#endif cpu_init_msrs(&cpu_info_primary, true); use_pae = 1; /* PAE always enabled in long mode */ Index: src/sys/arch/amd64/amd64/trap.c diff -u src/sys/arch/amd64/amd64/trap.c:1.96.4.1 src/sys/arch/amd64/amd64/trap.c:1.96.4.2 --- src/sys/arch/amd64/amd64/trap.c:1.96.4.1 Wed Mar 7 14:50:56 2018 +++ src/sys/arch/amd64/amd64/trap.c Thu Mar 22 16:59:03 2018 @@ -1,4 +1,4 @@ -/* $NetBSD: trap.c,v 1.96.4.1 2018/03/07 14:50:56 martin Exp $ */ +/* $NetBSD: trap.c,v 1.96.4.2 2018/03/22 16:59:03 martin Exp $ */ /*- * Copyright (c) 1998, 2000 The NetBSD Foundation, Inc. 
@@ -68,7 +68,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: trap.c,v 1.96.4.1 2018/03/07 14:50:56 martin Exp $"); +__KERNEL_RCSID(0, "$NetBSD: trap.c,v 1.96.4.2 2018/03/22 16:59:03 martin Exp $"); #include "opt_ddb.h" #include "opt_kgdb.h" @@ -266,9 +266,6 @@ trap(struct trapframe *frame) extern char fusuintrfailure[], kcopy_fault[]; extern char IDTVEC(osyscall)[]; extern char IDTVEC(syscall32)[]; -#ifndef XEN - struct trapframe *vframe; -#endif ksiginfo_t ksi; void *onfault; int type, error; @@ -357,92 +354,7 @@ copyfault: return; } - /* - * Check for failure during return to user mode. - * This can happen loading invalid values into the segment - * registers, or during the 'iret' itself. - * - * We do this by looking at the instruction we faulted on. - * The specific instructions we recognize only happen when - * returning from a trap, syscall, or interrupt. - */ - -kernelfault: -#ifdef XEN - /* - * XXX: there has to be an equivalent 'problem' - * but I (dsl) don't know exactly what happens! - * For now panic the kernel. - */ goto we_re_toast; -#else - KSI_INIT_TRAP(&ksi); - ksi.ksi_signo = SIGSEGV; - ksi.ksi_code = SEGV_ACCERR; - ksi.ksi_trap = type; - - /* Get %rsp value before fault - there may be a pad word - * below the trap frame. */ - vframe = (void *)frame->tf_rsp; - if (frame->tf_rip == 0) { - /* - * Assume that if we jumped to null we - * probably did it via a null function - * pointer, so print the return address. - */ - printf("kernel jumped to null; return addr was %p\n", - *(void **)frame->tf_rsp); - goto we_re_toast; - } - switch (*(uint16_t *)frame->tf_rip) { - case 0xcf48: /* iretq */ - /* - * The 'iretq' instruction faulted, so we have the - * 'user' registers saved after the kernel - * %rip:%cs:%fl:%rsp:%ss of the iret, and below that - * the user %rip:%cs:%fl:%rsp:%ss the 'iret' was - * processing. - * We must copy the user register back over the - * kernel fault frame to generate a normal stack - * frame (eg for sending a SIGSEGV). - */ - vframe = (void *)((char *)vframe - - offsetof(struct trapframe, tf_rip)); - memmove(vframe, frame, - offsetof(struct trapframe, tf_rip)); - /* Set the faulting address to the user %eip */ - ksi.ksi_addr = (void *)vframe->tf_rip; - break; - case 0x848e: /* mov 0xa8(%rsp),%es (8e 84 24 a8 00 00 00) */ - case 0x9c8e: /* mov 0xb0(%rsp),%ds (8e 9c 24 b0 00 00 00) */ -#ifdef USER_LDT - case 0xa48e: /* mov 0xa0(%rsp),%fs (8e a4 24 a0 00 00 00) */ - case 0xac8e: /* mov 0x98(%rsp),%gs (8e ac 24 98 00 00 00) */ -#endif - /* - * We faulted loading one of the user segment registers. - * The stack frame containing the user registers is - * still valid and pointed to by tf_rsp. - * Maybe we should check the iretq follows. - */ - if (KERNELMODE(vframe->tf_cs, vframe->tf_eflags)) - goto we_re_toast; - /* There is no valid address for the fault */ - break; - - default: - goto we_re_toast; - } - - /* XXX: worry about on-stack trampolines for nested - * handlers?? 
*/ - /* Save outer frame for any signal return */ - l->l_md.md_regs = vframe; - (*p->p_emul->e_trapsignal)(l, &ksi); - /* Return to user by reloading the user frame */ - trap_return_fault_return(vframe); - /* NOTREACHED */ -#endif case T_PROTFLT|T_USER: /* protection fault */ #if defined(COMPAT_NETBSD32) && defined(COMPAT_10) @@ -700,7 +612,7 @@ faultcommon: goto copyfault; printf("uvm_fault(%p, 0x%lx, %d) -> %x\n", map, va, ftype, error); - goto kernelfault; + goto we_re_toast; } KSI_INIT_TRAP(&ksi); Index: src/sys/arch/amd64/amd64/vector.S diff -u src/sys/arch/amd64/amd64/vector.S:1.49.2.1 src/sys/arch/amd64/amd64/vector.S:1.49.2.2 --- src/sys/arch/amd64/amd64/vector.S:1.49.2.1 Wed Mar 7 14:50:56 2018 +++ src/sys/arch/amd64/amd64/vector.S Thu Mar 22 16:59:03 2018 @@ -1,4 +1,4 @@ -/* $NetBSD: vector.S,v 1.49.2.1 2018/03/07 14:50:56 martin Exp $ */ +/* $NetBSD: vector.S,v 1.49.2.2 2018/03/22 16:59:03 martin Exp $ */ /*- * Copyright (c) 1998, 2007, 2008 The NetBSD Foundation, Inc. @@ -114,10 +114,7 @@ IDTVEC(recurse_lapic_ipi) INTRENTRY jmp 1f IDTVEC_END(recurse_lapic_ipi) -IDTVEC(intr_x2apic_ipi) - pushq $0 - pushq $T_ASTFLT - INTRENTRY +IDTVEC(handle_x2apic_ipi) movl $(MSR_X2APIC_BASE + MSR_X2APIC_EOI),%ecx xorl %eax,%eax xorl %edx,%edx @@ -126,17 +123,14 @@ IDTVEC(intr_x2apic_ipi) cmpl $IPL_HIGH,%ebx jae 2f jmp 1f -IDTVEC_END(intr_x2apic_ipi) -IDTVEC(intr_lapic_ipi) - pushq $0 - pushq $T_ASTFLT - INTRENTRY +IDTVEC_END(handle_x2apic_ipi) +IDTVEC(handle_lapic_ipi) movq _C_LABEL(local_apic_va),%rbx movl $0,LAPIC_EOI(%rbx) movl CPUVAR(ILEVEL),%ebx cmpl $IPL_HIGH,%ebx jae 2f -IDTVEC_END(intr_lapic_ipi) +IDTVEC_END(handle_lapic_ipi) IDTVEC(resume_lapic_ipi) 1: incl CPUVAR(IDEPTH) @@ -150,12 +144,23 @@ IDTVEC(resume_lapic_ipi) INTRFASTEXIT IDTVEC_END(resume_lapic_ipi) -#if defined(DDB) -IDTVEC(intrddb) -1: + TEXT_USER_BEGIN +IDTVEC(intr_x2apic_ipi) pushq $0 - pushq $T_BPTFLT + pushq $T_ASTFLT + INTRENTRY + jmp _C_LABEL(Xhandle_x2apic_ipi) +IDTVEC_END(intr_x2apic_ipi) +IDTVEC(intr_lapic_ipi) + pushq $0 + pushq $T_ASTFLT INTRENTRY + jmp _C_LABEL(Xhandle_lapic_ipi) +IDTVEC_END(intr_lapic_ipi) + TEXT_USER_END + +#if defined(DDB) +IDTVEC(handle_intrddbipi) movl $0xf,%eax movq %rax,%cr8 movq _C_LABEL(local_apic_va),%rbx @@ -165,13 +170,8 @@ IDTVEC(intrddb) xorl %eax,%eax movq %rax,%cr8 INTRFASTEXIT -IDTVEC_END(intrddb) - -IDTVEC(x2apic_intrddb) -1: - pushq $0 - pushq $T_BPTFLT - INTRENTRY +IDTVEC_END(handle_intrddbipi) +IDTVEC(handle_x2apic_intrddbipi) movl $0xf,%eax movq %rax,%cr8 movl $(MSR_X2APIC_BASE + MSR_X2APIC_EOI),%ecx @@ -183,7 +183,23 @@ IDTVEC(x2apic_intrddb) xorl %eax,%eax movq %rax,%cr8 INTRFASTEXIT +IDTVEC_END(handle_x2apic_intrddbipi) + + TEXT_USER_BEGIN +IDTVEC(intrddb) + pushq $0 + pushq $T_BPTFLT + INTRENTRY + jmp _C_LABEL(Xhandle_intrddbipi) +IDTVEC_END(intrddb) +IDTVEC(x2apic_intrddb) + pushq $0 + pushq $T_BPTFLT + INTRENTRY + jmp _C_LABEL(Xhandle_x2apic_intrddbipi) IDTVEC_END(x2apic_intrddb) + TEXT_USER_END + #endif /* DDB */ #endif /* MULTIPROCESSOR */ @@ -197,10 +213,7 @@ IDTVEC(recurse_lapic_ltimer) INTRENTRY jmp 1f IDTVEC_END(recurse_lapic_ltimer) -IDTVEC(intr_x2apic_ltimer) - pushq $0 - pushq $T_ASTFLT - INTRENTRY +IDTVEC(handle_x2apic_ltimer) movl $(MSR_X2APIC_BASE + MSR_X2APIC_EOI),%ecx xorl %eax,%eax xorl %edx,%edx @@ -209,11 +222,8 @@ IDTVEC(intr_x2apic_ltimer) cmpl $IPL_CLOCK,%ebx jae 2f jmp 1f -IDTVEC_END(intr_x2apic_ltimer) -IDTVEC(intr_lapic_ltimer) - pushq $0 - pushq $T_ASTFLT - INTRENTRY +IDTVEC_END(handle_x2apic_ltimer) +IDTVEC(handle_lapic_ltimer) movq 
_C_LABEL(local_apic_va),%rbx movl $0,LAPIC_EOI(%rbx) movl CPUVAR(ILEVEL),%ebx @@ -234,33 +244,57 @@ IDTVEC(resume_lapic_ltimer) orl $(1 << LIR_TIMER),CPUVAR(IPENDING) INTRFASTEXIT IDTVEC_END(resume_lapic_ltimer) + + TEXT_USER_BEGIN +IDTVEC(intr_x2apic_ltimer) + pushq $0 + pushq $T_ASTFLT + INTRENTRY + jmp _C_LABEL(Xhandle_x2apic_ltimer) +IDTVEC_END(intr_x2apic_ltimer) +IDTVEC(intr_lapic_ltimer) + pushq $0 + pushq $T_ASTFLT + INTRENTRY + jmp _C_LABEL(Xhandle_lapic_ltimer) +IDTVEC_END(intr_lapic_ltimer) + TEXT_USER_END + #endif /* NLAPIC > 0 */ #ifndef XEN /* * TLB shootdown handler. */ -IDTVEC(intr_lapic_tlb) - pushq $0 - pushq $T_ASTFLT - INTRENTRY +IDTVEC(handle_lapic_tlb) movq _C_LABEL(local_apic_va),%rax movl $0,LAPIC_EOI(%rax) callq _C_LABEL(pmap_tlb_intr) INTRFASTEXIT -IDTVEC_END(intr_lapic_tlb) - -IDTVEC(intr_x2apic_tlb) - pushq $0 - pushq $T_ASTFLT - INTRENTRY +IDTVEC_END(handle_lapic_tlb) +IDTVEC(handle_x2apic_tlb) movl $(MSR_X2APIC_BASE + MSR_X2APIC_EOI),%ecx xorl %eax,%eax xorl %edx,%edx wrmsr callq _C_LABEL(pmap_tlb_intr) INTRFASTEXIT +IDTVEC_END(handle_x2apic_tlb) + + TEXT_USER_BEGIN +IDTVEC(intr_lapic_tlb) + pushq $0 + pushq $T_ASTFLT + INTRENTRY + jmp _C_LABEL(Xhandle_lapic_tlb) +IDTVEC_END(intr_lapic_tlb) +IDTVEC(intr_x2apic_tlb) + pushq $0 + pushq $T_ASTFLT + INTRENTRY + jmp _C_LABEL(Xhandle_x2apic_tlb) IDTVEC_END(intr_x2apic_tlb) + TEXT_USER_END #endif /* !XEN */ @@ -269,7 +303,7 @@ IDTVEC_END(intr_x2apic_tlb) #ifndef XEN /* - * This macro defines the generic stub code. Its arguments modifiy it + * This macro defines the generic stub code. Its arguments modify it * for specific PICs. */ @@ -285,10 +319,7 @@ IDTVEC(resume_ ## name ## num) \ movq CPUVAR(ISOURCES) + (num) * 8,%r14 ;\ movl IS_MAXLEVEL(%r14),%ebx ;\ jmp 1f ;\ -IDTVEC(intr_ ## name ## num) ;\ - pushq $0 /* dummy error code */ ;\ - pushq $T_ASTFLT /* trap # for doing ASTs */ ;\ - INTRENTRY ;\ +IDTVEC(handle_ ## name ## num) ;\ movq CPUVAR(ISOURCES) + (num) * 8,%r14 ;\ mask(num) /* mask it in hardware */ ;\ early_ack(num) /* and allow other intrs */ ;\ @@ -339,7 +370,16 @@ IDTVEC(intr_ ## name ## num) ;\ 9: \ unmask(num) ;\ late_ack(num) ;\ - INTRFASTEXIT + INTRFASTEXIT ;\ +IDTVEC_END(handle_ ## name ## num) ;\ + TEXT_USER_BEGIN ;\ +IDTVEC(intr_ ## name ## num) ;\ + pushq $0 /* dummy error code */ ;\ + pushq $T_ASTFLT /* trap # for doing ASTs */ ;\ + INTRENTRY ;\ + jmp _C_LABEL(Xhandle_ ## name ## num) ;\ +IDTVEC_END(intr_ ## name ## num) ;\ + TEXT_USER_END #define ICUADDR IO_ICU1 Index: src/sys/arch/amd64/conf/GENERIC diff -u src/sys/arch/amd64/conf/GENERIC:1.459.2.5 src/sys/arch/amd64/conf/GENERIC:1.459.2.6 --- src/sys/arch/amd64/conf/GENERIC:1.459.2.5 Sun Feb 11 21:17:34 2018 +++ src/sys/arch/amd64/conf/GENERIC Thu Mar 22 16:59:03 2018 @@ -1,4 +1,4 @@ -# $NetBSD: GENERIC,v 1.459.2.5 2018/02/11 21:17:34 snj Exp $ +# $NetBSD: GENERIC,v 1.459.2.6 2018/03/22 16:59:03 martin Exp $ # # GENERIC machine description file # @@ -22,7 +22,7 @@ include "arch/amd64/conf/std.amd64" options INCLUDE_CONFIG_FILE # embed config file in kernel binary -#ident "GENERIC-$Revision: 1.459.2.5 $" +#ident "GENERIC-$Revision: 1.459.2.6 $" maxusers 64 # estimated number of users @@ -73,6 +73,9 @@ options USERCONF # userconf(4) support #options PIPE_SOCKETPAIR # smaller, but slower pipe(2) options SYSCTL_INCLUDE_DESCR # Include sysctl descriptions in kernel +# CPU-related options +#options SVS # Separate Virtual Space + # CPU features acpicpu* at cpu? # ACPI CPU (including frequency scaling) coretemp* at cpu? 
# Intel on-die thermal sensor Index: src/sys/arch/amd64/conf/kern.ldscript diff -u src/sys/arch/amd64/conf/kern.ldscript:1.22.6.2 src/sys/arch/amd64/conf/kern.ldscript:1.22.6.3 --- src/sys/arch/amd64/conf/kern.ldscript:1.22.6.2 Tue Mar 6 10:17:11 2018 +++ src/sys/arch/amd64/conf/kern.ldscript Thu Mar 22 16:59:03 2018 @@ -1,4 +1,4 @@ -/* $NetBSD: kern.ldscript,v 1.22.6.2 2018/03/06 10:17:11 martin Exp $ */ +/* $NetBSD: kern.ldscript,v 1.22.6.3 2018/03/22 16:59:03 martin Exp $ */ #include "assym.h" @@ -15,6 +15,12 @@ SECTIONS { .text : { + . = ALIGN(__PAGE_SIZE); + __text_user_start = . ; + *(.text.user) + . = ALIGN(__PAGE_SIZE); + __text_user_end = . ; + *(.text) *(.text.*) *(.stub) Index: src/sys/arch/amd64/include/frameasm.h diff -u src/sys/arch/amd64/include/frameasm.h:1.20.32.1 src/sys/arch/amd64/include/frameasm.h:1.20.32.2 --- src/sys/arch/amd64/include/frameasm.h:1.20.32.1 Wed Mar 7 14:50:57 2018 +++ src/sys/arch/amd64/include/frameasm.h Thu Mar 22 16:59:03 2018 @@ -1,10 +1,11 @@ -/* $NetBSD: frameasm.h,v 1.20.32.1 2018/03/07 14:50:57 martin Exp $ */ +/* $NetBSD: frameasm.h,v 1.20.32.2 2018/03/22 16:59:03 martin Exp $ */ #ifndef _AMD64_MACHINE_FRAMEASM_H #define _AMD64_MACHINE_FRAMEASM_H #ifdef _KERNEL_OPT #include "opt_xen.h" +#include "opt_svs.h" #endif /* @@ -35,6 +36,19 @@ #define STI(temp_reg) sti #endif /* XEN */ +#define HP_NAME_SVS_ENTER 5 +#define HP_NAME_SVS_LEAVE 6 +#define HP_NAME_SVS_ENTER_ALT 7 +#define HP_NAME_SVS_LEAVE_ALT 8 + +#define HOTPATCH(name, size) \ +123: ; \ + .pushsection .rodata.hotpatch, "a" ; \ + .byte name ; \ + .byte size ; \ + .quad 123b ; \ + .popsection + #define SWAPGS NOT_XEN(swapgs) /* @@ -74,21 +88,68 @@ movq TF_RBX(%rsp),%rbx ; \ movq TF_RAX(%rsp),%rax -#define INTRENTRY_L(kernel_trap, usertrap) \ +#define TEXT_USER_BEGIN .pushsection .text.user, "ax" +#define TEXT_USER_END .popsection + +#ifdef SVS + +/* XXX: put this somewhere else */ +#define SVS_UTLS 0xffffc00000000000 /* PMAP_PCPU_BASE */ +#define UTLS_KPDIRPA 0 +#define UTLS_SCRATCH 8 +#define UTLS_RSP0 16 + +#define SVS_ENTER_BYTES 22 +#define NOSVS_ENTER \ + .byte 0xEB, (SVS_ENTER_BYTES-2) /* jmp */ ; \ + .fill (SVS_ENTER_BYTES-2),1,0xCC +#define SVS_ENTER \ + HOTPATCH(HP_NAME_SVS_ENTER, SVS_ENTER_BYTES) ; \ + NOSVS_ENTER + +#define SVS_LEAVE_BYTES 31 +#define NOSVS_LEAVE \ + .byte 0xEB, (SVS_LEAVE_BYTES-2) /* jmp */ ; \ + .fill (SVS_LEAVE_BYTES-2),1,0xCC +#define SVS_LEAVE \ + HOTPATCH(HP_NAME_SVS_LEAVE, SVS_LEAVE_BYTES) ; \ + NOSVS_LEAVE + +#define SVS_ENTER_ALT_BYTES 23 +#define NOSVS_ENTER_ALTSTACK \ + .byte 0xEB, (SVS_ENTER_ALT_BYTES-2) /* jmp */ ; \ + .fill (SVS_ENTER_ALT_BYTES-2),1,0xCC +#define SVS_ENTER_ALTSTACK \ + HOTPATCH(HP_NAME_SVS_ENTER_ALT, SVS_ENTER_ALT_BYTES) ; \ + NOSVS_ENTER_ALTSTACK + +#define SVS_LEAVE_ALT_BYTES 22 +#define NOSVS_LEAVE_ALTSTACK \ + .byte 0xEB, (SVS_LEAVE_ALT_BYTES-2) /* jmp */ ; \ + .fill (SVS_LEAVE_ALT_BYTES-2),1,0xCC +#define SVS_LEAVE_ALTSTACK \ + HOTPATCH(HP_NAME_SVS_LEAVE_ALT, SVS_LEAVE_ALT_BYTES) ; \ + NOSVS_LEAVE_ALTSTACK + +#else +#define SVS_ENTER /* nothing */ +#define SVS_LEAVE /* nothing */ +#define SVS_ENTER_ALTSTACK /* nothing */ +#define SVS_LEAVE_ALTSTACK /* nothing */ +#endif + +#define INTRENTRY \ subq $TF_REGSIZE,%rsp ; \ INTR_SAVE_GPRS ; \ cld ; \ testb $SEL_UPL,TF_CS(%rsp) ; \ - je kernel_trap ; \ -usertrap ; \ + je 98f ; \ SWAPGS ; \ + SVS_ENTER ; \ movw %gs,TF_GS(%rsp) ; \ movw %fs,TF_FS(%rsp) ; \ movw %es,TF_ES(%rsp) ; \ - movw %ds,TF_DS(%rsp) - -#define INTRENTRY \ - INTRENTRY_L(98f,) ; \ + movw %ds,TF_DS(%rsp) ; \ 98: 
#define INTRFASTEXIT \ Index: src/sys/arch/amd64/include/param.h diff -u src/sys/arch/amd64/include/param.h:1.21.6.1 src/sys/arch/amd64/include/param.h:1.21.6.2 --- src/sys/arch/amd64/include/param.h:1.21.6.1 Fri Mar 16 13:17:56 2018 +++ src/sys/arch/amd64/include/param.h Thu Mar 22 16:59:03 2018 @@ -1,4 +1,4 @@ -/* $NetBSD: param.h,v 1.21.6.1 2018/03/16 13:17:56 martin Exp $ */ +/* $NetBSD: param.h,v 1.21.6.2 2018/03/22 16:59:03 martin Exp $ */ #ifdef __x86_64__ @@ -53,9 +53,9 @@ #define SSIZE 1 /* initial stack size/NBPG */ #define SINCR 1 /* increment of stack/NBPG */ #ifdef DIAGNOSTIC -#define UPAGES 4 /* pages of u-area (1 for redzone) */ +#define UPAGES 5 /* pages of u-area (1 for redzone) */ #else -#define UPAGES 3 /* pages of u-area */ +#define UPAGES 4 /* pages of u-area */ #endif #define USPACE (UPAGES * NBPG) /* total size of u-area */ Index: src/sys/arch/amd64/include/pmap.h diff -u src/sys/arch/amd64/include/pmap.h:1.39 src/sys/arch/amd64/include/pmap.h:1.39.8.1 --- src/sys/arch/amd64/include/pmap.h:1.39 Fri Nov 11 12:06:31 2016 +++ src/sys/arch/amd64/include/pmap.h Thu Mar 22 16:59:03 2018 @@ -1,4 +1,4 @@ -/* $NetBSD: pmap.h,v 1.39 2016/11/11 12:06:31 maxv Exp $ */ +/* $NetBSD: pmap.h,v 1.39.8.1 2018/03/22 16:59:03 martin Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. @@ -218,6 +218,12 @@ */ #define NPTECL 8 +void svs_pmap_sync(struct pmap *, int); +void svs_lwp_switch(struct lwp *, struct lwp *); +void svs_pdir_switch(struct pmap *); +void svs_init(void); +extern bool svs_enabled; + #include <x86/pmap.h> #ifndef XEN Index: src/sys/arch/x86/conf/files.x86 diff -u src/sys/arch/x86/conf/files.x86:1.88 src/sys/arch/x86/conf/files.x86:1.88.6.1 --- src/sys/arch/x86/conf/files.x86:1.88 Fri Mar 10 14:40:56 2017 +++ src/sys/arch/x86/conf/files.x86 Thu Mar 22 16:59:03 2018 @@ -1,4 +1,4 @@ -# $NetBSD: files.x86,v 1.88 2017/03/10 14:40:56 maxv Exp $ +# $NetBSD: files.x86,v 1.88.6.1 2018/03/22 16:59:03 martin Exp $ # options for MP configuration through the MP spec defflag opt_mpbios.h MPBIOS MPVERBOSE MPDEBUG MPBIOS_SCANPCI @@ -16,6 +16,8 @@ defflag opt_pcifixup.h PCI_ADDR_FIXUP PC # To be able to test for NetBSD/xen in shared files defflag opt_xen.h DO_NOT_DEFINE +defflag SVS + define cpubus { [apid = -1] } define cpufeaturebus {} define ioapicbus { [apid = -1] } @@ -90,6 +92,7 @@ file arch/x86/x86/pmap.c machdep file arch/x86/x86/pmap_tlb.c machdep file arch/x86/x86/pmc.c machdep file arch/x86/x86/procfs_machdep.c procfs +file arch/x86/x86/svs.c machdep & svs file arch/x86/x86/sys_machdep.c machdep file arch/x86/x86/syscall.c machdep file arch/x86/x86/tsc.c machdep Index: src/sys/arch/x86/include/cpu.h diff -u src/sys/arch/x86/include/cpu.h:1.71.2.3 src/sys/arch/x86/include/cpu.h:1.71.2.4 --- src/sys/arch/x86/include/cpu.h:1.71.2.3 Fri Mar 16 13:17:56 2018 +++ src/sys/arch/x86/include/cpu.h Thu Mar 22 16:59:04 2018 @@ -1,4 +1,4 @@ -/* $NetBSD: cpu.h,v 1.71.2.3 2018/03/16 13:17:56 martin Exp $ */ +/* $NetBSD: cpu.h,v 1.71.2.4 2018/03/22 16:59:04 martin Exp $ */ /*- * Copyright (c) 1990 The Regents of the University of California. 
@@ -47,6 +47,7 @@ #if defined(_KERNEL) || defined(_KMEMUSER) #if defined(_KERNEL_OPT) #include "opt_xen.h" +#include "opt_svs.h" #ifdef i386 #include "opt_user_ldt.h" #include "opt_vm86.h" @@ -197,6 +198,18 @@ struct cpu_info { pd_entry_t * ci_pae_l3_pdir; /* VA pointer to L3 PD */ #endif +#ifdef SVS + pd_entry_t * ci_svs_updir; + paddr_t ci_svs_updirpa; + paddr_t ci_svs_kpdirpa; + kmutex_t ci_svs_mtx; + pd_entry_t * ci_svs_rsp0_pte; + vaddr_t ci_svs_rsp0; + vaddr_t ci_svs_ursp0; + vaddr_t ci_svs_krsp0; + vaddr_t ci_svs_utls; +#endif + #if defined(XEN) && (defined(PAE) || defined(__x86_64__)) /* Currently active user PGD (can't use rcr3() with Xen) */ pd_entry_t * ci_kpm_pdir; /* per-cpu PMD (va) */ @@ -342,6 +355,7 @@ void cpu_broadcast_halt(void); void cpu_kick(struct cpu_info *); void cpu_pcpuarea_init(struct cpu_info *); +void cpu_svs_init(struct cpu_info *); #define curcpu() x86_curcpu() #define curlwp x86_curlwp() Index: src/sys/arch/x86/include/pmap.h diff -u src/sys/arch/x86/include/pmap.h:1.64.6.1 src/sys/arch/x86/include/pmap.h:1.64.6.2 --- src/sys/arch/x86/include/pmap.h:1.64.6.1 Fri Mar 16 13:17:56 2018 +++ src/sys/arch/x86/include/pmap.h Thu Mar 22 16:59:04 2018 @@ -1,4 +1,4 @@ -/* $NetBSD: pmap.h,v 1.64.6.1 2018/03/16 13:17:56 martin Exp $ */ +/* $NetBSD: pmap.h,v 1.64.6.2 2018/03/22 16:59:04 martin Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. @@ -126,9 +126,13 @@ struct pcpu_entry { uint8_t ist0[PAGE_SIZE]; uint8_t ist1[PAGE_SIZE]; uint8_t ist2[PAGE_SIZE]; + uint8_t rsp0[2 * PAGE_SIZE]; } __packed; struct pcpu_area { +#ifdef SVS + uint8_t utls[PAGE_SIZE]; +#endif uint8_t idt[PAGE_SIZE]; uint8_t ldt[PAGE_SIZE]; struct pcpu_entry ent[MAXCPUS]; Index: src/sys/arch/x86/x86/cpu.c diff -u src/sys/arch/x86/x86/cpu.c:1.130.2.4 src/sys/arch/x86/x86/cpu.c:1.130.2.5 --- src/sys/arch/x86/x86/cpu.c:1.130.2.4 Fri Mar 16 13:17:56 2018 +++ src/sys/arch/x86/x86/cpu.c Thu Mar 22 16:59:04 2018 @@ -1,4 +1,4 @@ -/* $NetBSD: cpu.c,v 1.130.2.4 2018/03/16 13:17:56 martin Exp $ */ +/* $NetBSD: cpu.c,v 1.130.2.5 2018/03/22 16:59:04 martin Exp $ */ /*- * Copyright (c) 2000-2012 NetBSD Foundation, Inc. @@ -62,12 +62,13 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.130.2.4 2018/03/16 13:17:56 martin Exp $"); +__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.130.2.5 2018/03/22 16:59:04 martin Exp $"); #include "opt_ddb.h" #include "opt_mpbios.h" /* for MPDEBUG */ #include "opt_mtrr.h" #include "opt_multiprocessor.h" +#include "opt_svs.h" #include "lapic.h" #include "ioapic.h" @@ -410,6 +411,10 @@ cpu_attach(device_t parent, device_t sel KASSERT(ci->ci_data.cpu_idlelwp != NULL); } +#ifdef SVS + cpu_svs_init(ci); +#endif + pmap_reference(pmap_kernel()); ci->ci_pmap = pmap_kernel(); ci->ci_tlbstate = TLBSTATE_STALE; @@ -597,6 +602,9 @@ cpu_init(struct cpu_info *ci) * hardware supports it. 
*/ if (cpu_feature[0] & CPUID_PGE) +#ifdef SVS + if (!svs_enabled) +#endif cr4 |= CR4_PGE; /* enable global TLB caching */ /* @@ -1071,7 +1079,7 @@ mp_cpu_start_cleanup(struct cpu_info *ci #ifdef __x86_64__ typedef void (vector)(void); -extern vector Xsyscall, Xsyscall32; +extern vector Xsyscall, Xsyscall32, Xsyscall_svs; #endif void @@ -1085,6 +1093,11 @@ cpu_init_msrs(struct cpu_info *ci, bool wrmsr(MSR_CSTAR, (uint64_t)Xsyscall32); wrmsr(MSR_SFMASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D); +#ifdef SVS + if (svs_enabled) + wrmsr(MSR_LSTAR, (uint64_t)Xsyscall_svs); +#endif + if (full) { wrmsr(MSR_FSBASE, 0); wrmsr(MSR_GSBASE, (uint64_t)ci); @@ -1245,6 +1258,10 @@ x86_cpu_idle_halt(void) void cpu_load_pmap(struct pmap *pmap, struct pmap *oldpmap) { +#ifdef SVS + svs_pdir_switch(pmap); +#endif + #ifdef PAE struct cpu_info *ci = curcpu(); bool interrupts_enabled; Index: src/sys/arch/x86/x86/pmap.c diff -u src/sys/arch/x86/x86/pmap.c:1.245.6.5 src/sys/arch/x86/x86/pmap.c:1.245.6.6 --- src/sys/arch/x86/x86/pmap.c:1.245.6.5 Fri Mar 16 13:17:56 2018 +++ src/sys/arch/x86/x86/pmap.c Thu Mar 22 16:59:04 2018 @@ -1,4 +1,4 @@ -/* $NetBSD: pmap.c,v 1.245.6.5 2018/03/16 13:17:56 martin Exp $ */ +/* $NetBSD: pmap.c,v 1.245.6.6 2018/03/22 16:59:04 martin Exp $ */ /*- * Copyright (c) 2008, 2010, 2016, 2017 The NetBSD Foundation, Inc. @@ -171,12 +171,13 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.245.6.5 2018/03/16 13:17:56 martin Exp $"); +__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.245.6.6 2018/03/22 16:59:04 martin Exp $"); #include "opt_user_ldt.h" #include "opt_lockdebug.h" #include "opt_multiprocessor.h" #include "opt_xen.h" +#include "opt_svs.h" #include <sys/param.h> #include <sys/systm.h> @@ -2051,31 +2052,30 @@ pmap_free_ptp(struct pmap *pmap, struct do { index = pl_i(va, level + 1); opde = pmap_pte_testset(&pdes[level - 1][index], 0); -#if defined(XEN) -# if defined(__x86_64__) + /* - * If ptp is a L3 currently mapped in kernel space, - * on any cpu, clear it before freeing + * On Xen-amd64 or SVS, we need to sync the top level page + * directory on each CPU. */ +#if defined(XEN) && defined(__x86_64__) if (level == PTP_LEVELS - 1) { - /* - * Update the per-cpu PD on all cpus the current - * pmap is active on - */ xen_kpm_sync(pmap, index); } -# endif /*__x86_64__ */ +#elif defined(SVS) + if (svs_enabled && level == PTP_LEVELS - 1) { + svs_pmap_sync(pmap, index); + } +#endif + invaladdr = level == 1 ? (vaddr_t)ptes : (vaddr_t)pdes[level - 2]; pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, opde, TLBSHOOT_FREE_PTP1); + +#if defined(XEN) pmap_tlb_shootnow(); -#else /* XEN */ - invaladdr = level == 1 ? (vaddr_t)ptes : - (vaddr_t)pdes[level - 2]; - pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, - opde, TLBSHOOT_FREE_PTP1); -#endif /* XEN */ +#endif + pmap_freepage(pmap, ptp, level); if (level < PTP_LEVELS - 1) { ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1); @@ -2157,15 +2157,19 @@ pmap_get_ptp(struct pmap *pmap, vaddr_t pa = VM_PAGE_TO_PHYS(ptp); pmap_pte_set(&pva[index], (pd_entry_t) (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V)); + + /* + * On Xen-amd64 or SVS, we need to sync the top level page + * directory on each CPU. 
+ */ #if defined(XEN) && defined(__x86_64__) if (i == PTP_LEVELS) { - - /* - * Update the per-cpu PD on all cpus the current - * pmap is active on - */ xen_kpm_sync(pmap, index); } +#elif defined(SVS) + if (svs_enabled && i == PTP_LEVELS) { + svs_pmap_sync(pmap, index); + } #endif pmap_pte_flush(); pmap_stats_update(pmap, 1, 0); Index: src/sys/arch/x86/x86/vm_machdep.c diff -u src/sys/arch/x86/x86/vm_machdep.c:1.28.6.2 src/sys/arch/x86/x86/vm_machdep.c:1.28.6.3 --- src/sys/arch/x86/x86/vm_machdep.c:1.28.6.2 Sat Mar 17 11:23:18 2018 +++ src/sys/arch/x86/x86/vm_machdep.c Thu Mar 22 16:59:04 2018 @@ -1,4 +1,4 @@ -/* $NetBSD: vm_machdep.c,v 1.28.6.2 2018/03/17 11:23:18 martin Exp $ */ +/* $NetBSD: vm_machdep.c,v 1.28.6.3 2018/03/22 16:59:04 martin Exp $ */ /*- * Copyright (c) 1982, 1986 The Regents of the University of California. @@ -80,7 +80,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: vm_machdep.c,v 1.28.6.2 2018/03/17 11:23:18 martin Exp $"); +__KERNEL_RCSID(0, "$NetBSD: vm_machdep.c,v 1.28.6.3 2018/03/22 16:59:04 martin Exp $"); #include "opt_mtrr.h" @@ -178,9 +178,16 @@ cpu_lwp_fork(struct lwp *l1, struct lwp * returns normally. */ uv = uvm_lwp_getuarea(l2); + KASSERT(uv % PAGE_SIZE == 0); #ifdef __x86_64__ - pcb2->pcb_rsp0 = (uv + USPACE - 16) & ~0xf; +#ifdef SVS + pcb2->pcb_rsp0 = (uv + USPACE - PAGE_SIZE + + sizeof(struct trapframe)); + KASSERT((pcb2->pcb_rsp0 & 0xF) == 0); +#else + pcb2->pcb_rsp0 = (uv + USPACE - 16); +#endif tf = (struct trapframe *)pcb2->pcb_rsp0 - 1; #else pcb2->pcb_esp0 = (uv + USPACE - 16); Index: src/sys/arch/x86/x86/x86_machdep.c diff -u src/sys/arch/x86/x86/x86_machdep.c:1.91.4.1 src/sys/arch/x86/x86/x86_machdep.c:1.91.4.2 --- src/sys/arch/x86/x86/x86_machdep.c:1.91.4.1 Wed Jun 21 17:41:50 2017 +++ src/sys/arch/x86/x86/x86_machdep.c Thu Mar 22 16:59:04 2018 @@ -1,4 +1,4 @@ -/* $NetBSD: x86_machdep.c,v 1.91.4.1 2017/06/21 17:41:50 snj Exp $ */ +/* $NetBSD: x86_machdep.c,v 1.91.4.2 2018/03/22 16:59:04 martin Exp $ */ /*- * Copyright (c) 2002, 2006, 2007 YAMAMOTO Takashi, @@ -31,11 +31,12 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: x86_machdep.c,v 1.91.4.1 2017/06/21 17:41:50 snj Exp $"); +__KERNEL_RCSID(0, "$NetBSD: x86_machdep.c,v 1.91.4.2 2018/03/22 16:59:04 martin Exp $"); #include "opt_modular.h" #include "opt_physmem.h" #include "opt_splash.h" +#include "opt_svs.h" #include <sys/types.h> #include <sys/param.h> @@ -1179,6 +1180,22 @@ SYSCTL_SETUP(sysctl_machdep_setup, "sysc NULL, 0, &use_pae, 0, CTL_MACHDEP, CTL_CREATE, CTL_EOL); +#ifdef SVS + int sysctl_machdep_svs_enabled(SYSCTLFN_ARGS); + const struct sysctlnode *svs_rnode = NULL; + sysctl_createv(clog, 0, NULL, &svs_rnode, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "svs", NULL, + NULL, 0, NULL, 0, + CTL_MACHDEP, CTL_CREATE); + sysctl_createv(clog, 0, &svs_rnode, &svs_rnode, + CTLFLAG_READWRITE, + CTLTYPE_BOOL, "enabled", + SYSCTL_DESCR("Whether the kernel uses SVS"), + sysctl_machdep_svs_enabled, 0, &svs_enabled, 0, + CTL_CREATE, CTL_EOL); +#endif + /* None of these can ever change once the system has booted */ const_sysctl(clog, "fpu_present", CTLTYPE_INT, i386_fpu_present, CPU_FPU_PRESENT); Index: src/sys/arch/xen/conf/files.compat diff -u src/sys/arch/xen/conf/files.compat:1.25.8.1 src/sys/arch/xen/conf/files.compat:1.25.8.2 --- src/sys/arch/xen/conf/files.compat:1.25.8.1 Tue Aug 1 23:18:30 2017 +++ src/sys/arch/xen/conf/files.compat Thu Mar 22 16:59:04 2018 @@ -1,4 +1,4 @@ -# $NetBSD: files.compat,v 1.25.8.1 2017/08/01 23:18:30 snj Exp $ +# $NetBSD: files.compat,v 1.25.8.2 
2018/03/22 16:59:04 martin Exp $ # NetBSD: files.x86,v 1.10 2003/10/08 17:30:00 bouyer Exp # options for MP configuration through the MP spec @@ -29,6 +29,7 @@ defflag opt_pcifixup.h XXXOPT_PCIFIXUP defflag opt_vm86.h XXXVM86 defflag opt_pmc.h XXXPMC +defflag opt_svs.h XXXSVS # User-settable LDT (used by WINE) defflag opt_user_ldt.h XXXUSER_LDT Added files: Index: src/sys/arch/x86/x86/svs.c diff -u /dev/null src/sys/arch/x86/x86/svs.c:1.14.2.2 --- /dev/null Thu Mar 22 16:59:04 2018 +++ src/sys/arch/x86/x86/svs.c Thu Mar 22 16:59:04 2018 @@ -0,0 +1,753 @@ +/* $NetBSD: svs.c,v 1.14.2.2 2018/03/22 16:59:04 martin Exp $ */ + +/* + * Copyright (c) 2018 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Maxime Villard. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: svs.c,v 1.14.2.2 2018/03/22 16:59:04 martin Exp $"); + +#include "opt_svs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/cpu.h> +#include <sys/sysctl.h> +#include <sys/xcall.h> + +#include <x86/cputypes.h> +#include <machine/cpuvar.h> +#include <machine/frameasm.h> + +#include <uvm/uvm.h> +#include <uvm/uvm_page.h> + +/* + * Separate Virtual Space + * + * A per-cpu L4 page is maintained in ci_svs_updirpa. During each context + * switch to a user pmap, the lower half of updirpa is populated with the + * entries containing the userland pages. + * + * ~~~~~~~~~~ The UTLS Page ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * We use a special per-cpu page that we call UTLS, for User Thread Local + * Storage. Each CPU has one UTLS page. This page has two VAs: + * + * o When the user page tables are loaded in CR3, the VA to access this + * page is &pcpuarea->utls, defined as SVS_UTLS in assembly. This VA is + * _constant_ across CPUs, but in the user page tables this VA points to + * the physical page of the UTLS that is _local_ to the CPU. + * + * o When the kernel page tables are loaded in CR3, the VA to access this + * page is ci->ci_svs_utls. 
+ * + * +----------------------------------------------------------------------+ + * | CPU0 Local Data (Physical Page) | + * | +------------------+ +-------------+ | + * | | User Page Tables | SVS_UTLS ---------------------> | cpu0's UTLS | | + * | +------------------+ +-------------+ | + * +-------------------------------------------------------------^--------+ + * | + * +----------+ + * | + * +----------------------------------------------------------------------+ | + * | CPU1 Local Data (Physical Page) | | + * | +------------------+ +-------------+ | | + * | | User Page Tables | SVS_UTLS ---------------------> | cpu1's UTLS | | | + * | +------------------+ +-------------+ | | + * +-------------------------------------------------------------^--------+ | + * | | + * +------------------+ /----------------------+ | + * | Kern Page Tables | ci->ci_svs_utls | + * +------------------+ \---------------------------------+ + * + * The goal of the UTLS page is to provide an area where we can store whatever + * we want, in a way that it is accessible both when the Kernel and when the + * User page tables are loaded in CR3. + * + * We store in the UTLS page three 64bit values: + * + * o UTLS_KPDIRPA: the value we must put in CR3 in order to load the kernel + * page tables. + * + * o UTLS_SCRATCH: a dummy place where we temporarily store a value during + * the syscall entry procedure. + * + * o UTLS_RSP0: the value we must put in RSP in order to have a stack where + * we can push the register states. This is used only during the syscall + * entry procedure, because there the CPU does not automatically switch + * RSP (it does not use the TSS.rsp0 mechanism described below). + * + * ~~~~~~~~~~ The Stack Switching Mechanism Without SVS ~~~~~~~~~~~~~~~~~~~~~~ + * + * The kernel stack is per-lwp (pcb_rsp0). When doing a context switch between + * two user LWPs, the kernel updates TSS.rsp0 (which is per-cpu) to point to + * the stack of the new LWP. Then the execution continues. At some point, the + * user LWP we context-switched to will perform a syscall or will receive an + * interrupt. There, the CPU will automatically read TSS.rsp0 and use it as a + * stack. The kernel then pushes the register states on this stack, and + * executes in kernel mode normally. + * + * TSS.rsp0 is used by the CPU only during ring3->ring0 transitions. Therefore, + * when an interrupt is received while we were in kernel mode, the CPU does not + * read TSS.rsp0. Instead, it just uses the current stack. + * + * ~~~~~~~~~~ The Stack Switching Mechanism With SVS ~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * In the pcpu_area structure, pointed to by the "pcpuarea" variable, each CPU + * has a two-page rsp0 entry (pcpuarea->ent[cid].rsp0). These two pages do + * _not_ have associated physical addresses. They are only two VAs. + * + * The first page is unmapped and acts as a redzone. The second page is + * dynamically kentered into the highest page of the real per-lwp kernel stack; + * but pay close attention, it is kentered _only_ in the user page tables. + * That is to say, the VA of this second page is mapped when the user page + * tables are loaded, but not mapped when the kernel page tables are loaded. + * + * During a context switch, svs_lwp_switch() gets called first. This function + * does the kenter job described above, not in the kernel page tables (that + * are currently loaded), but in the user page tables (that are not loaded). 
+ *
+ *        VIRTUAL ADDRESSES                        PHYSICAL ADDRESSES
+ *
+ * +-----------------------------+
+ * |     KERNEL PAGE TABLES      |
+ * |   +-------------------+     |                +-------------------+
+ * |   | pcb_rsp0 (page 0) | -------------------> | pcb_rsp0 (page 0) |
+ * |   +-------------------+     |                +-------------------+
+ * |   | pcb_rsp0 (page 1) | -------------------> | pcb_rsp0 (page 1) |
+ * |   +-------------------+     |                +-------------------+
+ * |   | pcb_rsp0 (page 2) | -------------------> | pcb_rsp0 (page 2) |
+ * |   +-------------------+     |                +-------------------+
+ * |   | pcb_rsp0 (page 3) | -------------------> | pcb_rsp0 (page 3) |
+ * |   +-------------------+     |            +-> +-------------------+
+ * +-----------------------------+            |
+ *                                            |
+ * +---------------------------------------+  |
+ * |           USER PAGE TABLES            |  |
+ * |  +----------------------------------+ |  |
+ * |  | pcpuarea->ent[cid].rsp0 (page 0) | |  |
+ * |  +----------------------------------+ |  |
+ * |  | pcpuarea->ent[cid].rsp0 (page 1) | ---+
+ * |  +----------------------------------+ |
+ * +---------------------------------------+
+ *
+ * After svs_lwp_switch() gets called, we set pcpuarea->ent[cid].rsp0 (page 1)
+ * in TSS.rsp0. Later, when returning to userland on the lwp we context-
+ * switched to, we will load the user page tables and execute in userland
+ * normally.
+ *
+ * Next time an interrupt or syscall is received, the CPU will automatically
+ * use TSS.rsp0 as a stack. Here it is executing with the user page tables
+ * loaded, and therefore TSS.rsp0 is _mapped_.
+ *
+ * As part of the kernel entry procedure, we now switch CR3 to load the kernel
+ * page tables. Here, we are still using the stack pointer we set in TSS.rsp0.
+ *
+ * Remember that it was only one page of stack which was mapped only in the
+ * user page tables. We just switched to the kernel page tables, so we must
+ * update RSP to be the real per-lwp kernel stack (pcb_rsp0). And we do so,
+ * without touching the stack (since it is now unmapped, touching it would
+ * fault).
+ *
+ * After we updated RSP, we can continue execution exactly as in the non-SVS
+ * case. We don't need to copy the values the CPU pushed on TSS.rsp0: even if
+ * we updated RSP to a totally different VA, this VA points to the same
+ * physical page as TSS.rsp0. So in the end, the values the CPU pushed are
+ * still here even with the new RSP.
+ *
+ * Thanks to this double-kenter optimization, we don't need to copy the
+ * trapframe during each user<->kernel transition.
+ *
+ * ~~~~~~~~~~ Notes On Locking And Synchronization ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  o Touching ci_svs_updir without holding ci_svs_mtx first is *not*
+ *    allowed.
+ *
+ *  o pm_kernel_cpus contains the set of CPUs that have the pmap loaded
+ *    in their CR3 register. It must *not* be replaced by pm_cpus.
+ *
+ *  o When a context switch on the current CPU is made from a user LWP
+ *    towards a kernel LWP, CR3 is not updated. Therefore, the pmap's
+ *    pm_kernel_cpus still contains the current CPU. It implies that the
+ *    remote CPUs that execute other threads of the user process we just
+ *    left will keep synchronizing us against their changes.
+ *
+ * ~~~~~~~~~~ List Of Areas That Are Removed From Userland ~~~~~~~~~~~~~~~~~~~
+ *
+ *  o PTE Space
+ *  o Direct Map
+ *  o Remote PCPU Areas
+ *  o Kernel Heap
+ *  o Kernel Image
+ *
+ * ~~~~~~~~~~ Todo List ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * Ordered from highest priority to lowest:
+ *
+ *  o The NMI stack is not double-entered. Therefore if we ever receive an NMI
+ *    and leave it, the content of the stack will be visible to userland (via
+ *    Meltdown). Normally we never leave NMIs, unless a privileged user
+ *    launched PMCs. That's unlikely to happen: our PMC support is pretty
+ *    minimal, and privileged-only.
+ *
+ *  o Narrow down the entry points: hide the 'jmp handler' instructions. This
+ *    makes sense on GENERIC_KASLR kernels.
+ *
+ *  o Right now there is only one global LDT, and that's not compatible with
+ *    USER_LDT.
+ */
+
+bool svs_enabled __read_mostly = false;
+
+struct svs_utls {
+	paddr_t kpdirpa;
+	uint64_t scratch;
+	vaddr_t rsp0;
+};
+
+static pd_entry_t *
+svs_tree_add(struct cpu_info *ci, vaddr_t va)
+{
+	extern const vaddr_t ptp_masks[];
+	extern const int ptp_shifts[];
+	extern const long nbpd[];
+	pd_entry_t *dstpde;
+	size_t i, pidx, mod;
+	struct vm_page *pg;
+	paddr_t pa;
+
+	dstpde = ci->ci_svs_updir;
+	mod = (size_t)-1;
+
+	for (i = PTP_LEVELS; i > 1; i--) {
+		pidx = pl_i(va % mod, i);
+
+		if (!pmap_valid_entry(dstpde[pidx])) {
+			pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
+			if (pg == NULL)
+				panic("%s: failed to allocate PA for CPU %d\n",
+				    __func__, cpu_index(ci));
+			pa = VM_PAGE_TO_PHYS(pg);
+
+			dstpde[pidx] = PG_V | PG_RW | pa;
+		}
+
+		pa = (paddr_t)(dstpde[pidx] & PG_FRAME);
+		dstpde = (pd_entry_t *)PMAP_DIRECT_MAP(pa);
+		mod = nbpd[i-1];
+	}
+
+	return dstpde;
+}
+
+static void
+svs_page_add(struct cpu_info *ci, vaddr_t va)
+{
+	pd_entry_t *srcpde, *dstpde, pde;
+	size_t idx, pidx;
+	paddr_t pa;
+
+	/* Create levels L4, L3 and L2. */
+	dstpde = svs_tree_add(ci, va);
+
+	pidx = pl1_i(va % NBPD_L2);
+
+	/*
+	 * If 'va' is in a large page, we need to compute its physical
+	 * address manually.
+	 */
+	idx = pl2_i(va);
+	srcpde = L2_BASE;
+	if (!pmap_valid_entry(srcpde[idx])) {
+		panic("%s: L2 page not mapped", __func__);
+	}
+	if (srcpde[idx] & PG_PS) {
+		pa = srcpde[idx] & PG_2MFRAME;
+		pa += (paddr_t)(va % NBPD_L2);
+		pde = (srcpde[idx] & ~(PG_G|PG_PS|PG_2MFRAME)) | pa;
+
+		if (pmap_valid_entry(dstpde[pidx])) {
+			panic("%s: L1 page already mapped", __func__);
+		}
+		dstpde[pidx] = pde;
+		return;
+	}
+
+	/*
+	 * Normal page, just copy the PDE.
+	 */
+	idx = pl1_i(va);
+	srcpde = L1_BASE;
+	if (!pmap_valid_entry(srcpde[idx])) {
+		panic("%s: L1 page not mapped", __func__);
+	}
+	if (pmap_valid_entry(dstpde[pidx])) {
+		panic("%s: L1 page already mapped", __func__);
+	}
+	dstpde[pidx] = srcpde[idx] & ~(PG_G);
+}
+
+static void
+svs_rsp0_init(struct cpu_info *ci)
+{
+	const cpuid_t cid = cpu_index(ci);
+	vaddr_t va, rsp0;
+	pd_entry_t *pd;
+	size_t pidx;
+
+	rsp0 = (vaddr_t)&pcpuarea->ent[cid].rsp0;
+
+	/* The first page is a redzone. */
+	va = rsp0 + PAGE_SIZE;
+
+	/* Create levels L4, L3 and L2. */
+	pd = svs_tree_add(ci, va);
+
+	/* Get the info for L1. */
+	pidx = pl1_i(va % NBPD_L2);
+	if (pmap_valid_entry(pd[pidx])) {
+		panic("%s: rsp0 page already mapped", __func__);
+	}
+
+	ci->ci_svs_rsp0_pte = (pt_entry_t *)&pd[pidx];
+	ci->ci_svs_rsp0 = rsp0 + PAGE_SIZE + sizeof(struct trapframe);
+	ci->ci_svs_ursp0 = ci->ci_svs_rsp0 - sizeof(struct trapframe);
+	ci->ci_svs_krsp0 = 0;
+}
+
+static void
+svs_utls_init(struct cpu_info *ci)
+{
+	const vaddr_t utlsva = (vaddr_t)&pcpuarea->utls;
+	struct svs_utls *utls;
+	struct vm_page *pg;
+	pd_entry_t *pd;
+	size_t pidx;
+	paddr_t pa;
+	vaddr_t va;
+
+	/* Create levels L4, L3 and L2 of the UTLS page. */
+	pd = svs_tree_add(ci, utlsva);
+
+	/* Allocate L1. */
+	pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
+	if (pg == NULL)
+		panic("%s: failed to allocate PA for CPU %d\n", __func__,
+		    cpu_index(ci));
+	pa = VM_PAGE_TO_PHYS(pg);
+
+	/* Enter L1. */
+	if (pmap_valid_entry(L1_BASE[pl1_i(utlsva)])) {
+		panic("%s: local page already mapped", __func__);
+	}
+	pidx = pl1_i(utlsva % NBPD_L2);
+	if (pmap_valid_entry(pd[pidx])) {
+		panic("%s: L1 page already mapped", __func__);
+	}
+	pd[pidx] = PG_V | PG_RW | pmap_pg_nx | pa;
+
+	/*
+	 * Now, allocate a VA in the kernel map, that points to the UTLS
+	 * page. After that, the UTLS page will be accessible in kernel
+	 * mode via ci_svs_utls.
+	 */
+	va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
+	    UVM_KMF_VAONLY|UVM_KMF_NOWAIT);
+	if (va == 0) {
+		panic("%s: unable to allocate VA\n", __func__);
+	}
+	pmap_kenter_pa(va, pa, VM_PROT_READ|VM_PROT_WRITE, 0);
+	pmap_update(pmap_kernel());
+
+	ci->ci_svs_utls = va;
+
+	/* Initialize the constant fields of the UTLS page */
+	utls = (struct svs_utls *)ci->ci_svs_utls;
+	utls->rsp0 = ci->ci_svs_rsp0;
+}
+
+static void
+svs_range_add(struct cpu_info *ci, vaddr_t va, size_t size)
+{
+	size_t i, n;
+
+	KASSERT(size % PAGE_SIZE == 0);
+	n = size / PAGE_SIZE;
+	for (i = 0; i < n; i++) {
+		svs_page_add(ci, va + i * PAGE_SIZE);
+	}
+}
+
+void
+cpu_svs_init(struct cpu_info *ci)
+{
+	extern char __text_user_start;
+	extern char __text_user_end;
+	const cpuid_t cid = cpu_index(ci);
+	struct vm_page *pg;
+
+	KASSERT(ci != NULL);
+
+	pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
+	if (pg == NULL)
+		panic("%s: failed to allocate L4 PA for CPU %d\n",
+		    __func__, cpu_index(ci));
+	ci->ci_svs_updirpa = VM_PAGE_TO_PHYS(pg);
+
+	ci->ci_svs_updir = (pt_entry_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
+	    UVM_KMF_VAONLY | UVM_KMF_NOWAIT);
+	if (ci->ci_svs_updir == NULL)
+		panic("%s: failed to allocate L4 VA for CPU %d\n",
+		    __func__, cpu_index(ci));
+
+	pmap_kenter_pa((vaddr_t)ci->ci_svs_updir, ci->ci_svs_updirpa,
+	    VM_PROT_READ | VM_PROT_WRITE, 0);
+
+	pmap_update(pmap_kernel());
+
+	ci->ci_svs_kpdirpa = pmap_pdirpa(pmap_kernel(), 0);
+
+	mutex_init(&ci->ci_svs_mtx, MUTEX_DEFAULT, IPL_VM);
+
+	svs_page_add(ci, (vaddr_t)&pcpuarea->idt);
+	svs_page_add(ci, (vaddr_t)&pcpuarea->ldt);
+	svs_range_add(ci, (vaddr_t)&pcpuarea->ent[cid],
+	    offsetof(struct pcpu_entry, rsp0));
+	svs_range_add(ci, (vaddr_t)&__text_user_start,
+	    (vaddr_t)&__text_user_end - (vaddr_t)&__text_user_start);
+
+	svs_rsp0_init(ci);
+	svs_utls_init(ci);
+}
+
+void
+svs_pmap_sync(struct pmap *pmap, int index)
+{
+	CPU_INFO_ITERATOR cii;
+	struct cpu_info *ci;
+	cpuid_t cid;
+
+	KASSERT(pmap != NULL);
+	KASSERT(pmap != pmap_kernel());
+	KASSERT(mutex_owned(pmap->pm_lock));
+	KASSERT(kpreempt_disabled());
+	KASSERT(index < 255);
+
+	for (CPU_INFO_FOREACH(cii, ci)) {
+		cid = cpu_index(ci);
+
+		if (!kcpuset_isset(pmap->pm_kernel_cpus, cid)) {
+			continue;
+		}
+
+		/* take the lock and check again */
+		mutex_enter(&ci->ci_svs_mtx);
+		if (kcpuset_isset(pmap->pm_kernel_cpus, cid)) {
+			ci->ci_svs_updir[index] = pmap->pm_pdir[index];
+		}
+		mutex_exit(&ci->ci_svs_mtx);
+	}
+}
+
+void
+svs_lwp_switch(struct lwp *oldlwp, struct lwp *newlwp)
+{
+	struct cpu_info *ci = curcpu();
+	struct svs_utls *utls;
+	struct pcb *pcb;
+	pt_entry_t *pte;
+	uintptr_t rsp0;
+	vaddr_t va;
+
+	if (newlwp->l_flag & LW_SYSTEM) {
+		return;
+	}
+
+#ifdef DIAGNOSTIC
+	if (oldlwp != NULL && !(oldlwp->l_flag & LW_SYSTEM)) {
+		pcb = lwp_getpcb(oldlwp);
+		rsp0 = pcb->pcb_rsp0;
+		va = rounddown(rsp0, PAGE_SIZE);
+		KASSERT(ci->ci_svs_krsp0 == rsp0 - sizeof(struct trapframe));
+		pte = ci->ci_svs_rsp0_pte;
+		KASSERT(*pte == L1_BASE[pl1_i(va)]);
+	}
+#endif
+
+	pcb = lwp_getpcb(newlwp);
+	rsp0 = pcb->pcb_rsp0;
+	va = rounddown(rsp0, PAGE_SIZE);
+
+	/* Update the kernel rsp0 in cpu_info */
+	ci->ci_svs_krsp0 = rsp0 - sizeof(struct trapframe);
+	KASSERT((ci->ci_svs_krsp0 % PAGE_SIZE) ==
+	    (ci->ci_svs_ursp0 % PAGE_SIZE));
+
+	utls = (struct svs_utls *)ci->ci_svs_utls;
+	utls->scratch = 0;
+
+	/*
+	 * Enter the user rsp0. We don't need to flush the TLB here, since
+	 * the user page tables are not loaded.
+	 */
+	pte = ci->ci_svs_rsp0_pte;
+	*pte = L1_BASE[pl1_i(va)];
+}
+
+static inline pt_entry_t
+svs_pte_atomic_read(struct pmap *pmap, size_t idx)
+{
+	/*
+	 * XXX: We don't have a basic atomic_fetch_64 function?
+	 */
+	return atomic_cas_64(&pmap->pm_pdir[idx], 666, 666);
+}
+
+/*
+ * We may come here with the pmap unlocked. So read its PTEs atomically. If
+ * a remote CPU is updating them at the same time, it's not a problem: the
+ * remote CPU will call svs_pmap_sync afterwards, and our updirpa will be
+ * synchronized properly.
+ */
+void
+svs_pdir_switch(struct pmap *pmap)
+{
+	struct cpu_info *ci = curcpu();
+	struct svs_utls *utls;
+	pt_entry_t pte;
+	size_t i;
+
+	KASSERT(kpreempt_disabled());
+	KASSERT(pmap != pmap_kernel());
+
+	ci->ci_svs_kpdirpa = pmap_pdirpa(pmap, 0);
+
+	/* Update the info in the UTLS page */
+	utls = (struct svs_utls *)ci->ci_svs_utls;
+	utls->kpdirpa = ci->ci_svs_kpdirpa;
+
+	mutex_enter(&ci->ci_svs_mtx);
+
+	/* User slots. */
+	for (i = 0; i < 255; i++) {
+		pte = svs_pte_atomic_read(pmap, i);
+		ci->ci_svs_updir[i] = pte;
+	}
+
+	mutex_exit(&ci->ci_svs_mtx);
+}
+
+static void
+svs_enable(void)
+{
+	extern uint8_t svs_enter, svs_enter_end;
+	extern uint8_t svs_enter_altstack, svs_enter_altstack_end;
+	extern uint8_t svs_leave, svs_leave_end;
+	extern uint8_t svs_leave_altstack, svs_leave_altstack_end;
+	u_long psl, cr0;
+	uint8_t *bytes;
+	size_t size;
+
+	svs_enabled = true;
+
+	x86_patch_window_open(&psl, &cr0);
+
+	bytes = &svs_enter;
+	size = (size_t)&svs_enter_end - (size_t)&svs_enter;
+	x86_hotpatch(HP_NAME_SVS_ENTER, bytes, size);
+
+	bytes = &svs_enter_altstack;
+	size = (size_t)&svs_enter_altstack_end -
+	    (size_t)&svs_enter_altstack;
+	x86_hotpatch(HP_NAME_SVS_ENTER_ALT, bytes, size);
+
+	bytes = &svs_leave;
+	size = (size_t)&svs_leave_end - (size_t)&svs_leave;
+	x86_hotpatch(HP_NAME_SVS_LEAVE, bytes, size);
+
+	bytes = &svs_leave_altstack;
+	size = (size_t)&svs_leave_altstack_end -
+	    (size_t)&svs_leave_altstack;
+	x86_hotpatch(HP_NAME_SVS_LEAVE_ALT, bytes, size);
+
+	x86_patch_window_close(psl, cr0);
+}
+
+static void
+svs_disable_hotpatch(void)
+{
+	extern uint8_t nosvs_enter, nosvs_enter_end;
+	extern uint8_t nosvs_enter_altstack, nosvs_enter_altstack_end;
+	extern uint8_t nosvs_leave, nosvs_leave_end;
+	extern uint8_t nosvs_leave_altstack, nosvs_leave_altstack_end;
+	u_long psl, cr0;
+	uint8_t *bytes;
+	size_t size;
+
+	x86_patch_window_open(&psl, &cr0);
+
+	bytes = &nosvs_enter;
+	size = (size_t)&nosvs_enter_end - (size_t)&nosvs_enter;
+	x86_hotpatch(HP_NAME_SVS_ENTER, bytes, size);
+
+	bytes = &nosvs_enter_altstack;
+	size = (size_t)&nosvs_enter_altstack_end -
+	    (size_t)&nosvs_enter_altstack;
+	x86_hotpatch(HP_NAME_SVS_ENTER_ALT, bytes, size);
+
+	bytes = &nosvs_leave;
+	size = (size_t)&nosvs_leave_end - (size_t)&nosvs_leave;
+	x86_hotpatch(HP_NAME_SVS_LEAVE, bytes, size);
+
+	bytes = &nosvs_leave_altstack;
+	size = (size_t)&nosvs_leave_altstack_end -
+	    (size_t)&nosvs_leave_altstack;
+	x86_hotpatch(HP_NAME_SVS_LEAVE_ALT, bytes, size);
+
+	x86_patch_window_close(psl, cr0);
+}
+
+static volatile unsigned long svs_cpu_barrier1 __cacheline_aligned;
+static volatile unsigned long svs_cpu_barrier2 __cacheline_aligned;
+typedef void (vector)(void);
+
+static void
+svs_disable_cpu(void *arg1, void *arg2)
+{
+	struct cpu_info *ci = curcpu();
+	extern vector Xsyscall;
+	u_long psl;
+
+	psl = x86_read_psl();
+	x86_disable_intr();
+
+	atomic_dec_ulong(&svs_cpu_barrier1);
+	while (atomic_cas_ulong(&svs_cpu_barrier1, 0, 0) != 0) {
+		x86_pause();
+	}
+
+	/* cpu0 is the one that does the hotpatch job */
+	if (ci == &cpu_info_primary) {
+		svs_enabled = false;
+		svs_disable_hotpatch();
+	}
+
+	/* put back the non-SVS syscall entry point */
+	wrmsr(MSR_LSTAR, (uint64_t)Xsyscall);
+
+	/* enable global pages */
+	if (cpu_feature[0] & CPUID_PGE)
+		lcr4(rcr4() | CR4_PGE);
+
+	atomic_dec_ulong(&svs_cpu_barrier2);
+	while (atomic_cas_ulong(&svs_cpu_barrier2, 0, 0) != 0) {
+		x86_pause();
+	}
+
+	/* Write back and invalidate cache, flush pipelines. */
+	wbinvd();
+	x86_flush();
+
+	x86_write_psl(psl);
+}
+
+static int
+svs_disable(void)
+{
+	struct cpu_info *ci = NULL;
+	CPU_INFO_ITERATOR cii;
+	uint64_t xc;
+
+	mutex_enter(&cpu_lock);
+
+	/*
+	 * We expect all the CPUs to be online.
+	 */
+	for (CPU_INFO_FOREACH(cii, ci)) {
+		struct schedstate_percpu *spc = &ci->ci_schedstate;
+		if (spc->spc_flags & SPCF_OFFLINE) {
+			printf("[!] cpu%d offline, SVS not disabled\n",
+			    cpu_index(ci));
+			mutex_exit(&cpu_lock);
+			return EOPNOTSUPP;
+		}
+	}
+
+	svs_cpu_barrier1 = ncpu;
+	svs_cpu_barrier2 = ncpu;
+
+	printf("[+] Disabling SVS...");
+	xc = xc_broadcast(0, svs_disable_cpu, NULL, NULL);
+	xc_wait(xc);
+	printf(" done!\n");
+
+	mutex_exit(&cpu_lock);
+
+	return 0;
+}
+
+int sysctl_machdep_svs_enabled(SYSCTLFN_ARGS);
+
+int
+sysctl_machdep_svs_enabled(SYSCTLFN_ARGS)
+{
+	struct sysctlnode node;
+	int error, val;
+
+	val = *(int *)rnode->sysctl_data;
+
+	node = *rnode;
+	node.sysctl_data = &val;
+
+	error = sysctl_lookup(SYSCTLFN_CALL(&node));
+	if (error != 0 || newp == NULL)
+		return error;
+
+	if (val == 1) {
+		error = EINVAL;
+	} else {
+		if (svs_enabled)
+			error = svs_disable();
+		else
+			error = 0;
+	}
+
+	return error;
+}
+
+void
+svs_init(void)
+{
+	if (cpu_vendor != CPUVENDOR_INTEL) {
+		return;
+	}
+	svs_enable();
+}
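
For reference, a sketch of the enter/leave blob pair that svs_enable()
hotpatches in (illustrative only: the authoritative definitions are in the
amd64 locore.S and frameasm.h revisions pulled up above, and the exact
labels and register usage there may differ; KRSP0/URSP0/UPDIRPA stand for
the assembly offsets of ci_svs_krsp0/ci_svs_ursp0/ci_svs_updirpa):

	/* svs_enter: kernel entry, switch to the kernel CR3 and stack */
	movq	SVS_UTLS+UTLS_KPDIRPA,%rax
	movq	%rax,%cr3
	movq	CPUVAR(KRSP0),%rsp

	/* svs_leave: kernel exit, switch back to the user CR3 and stack */
	movq	CPUVAR(URSP0),%rsp
	movq	CPUVAR(UPDIRPA),%rax
	movq	%rax,%cr3

The nosvs_* blobs restored by svs_disable_hotpatch() are the corresponding
do-nothing sequences used when SVS is off.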