Enable "fast system calls" via the 'syscall' instruction on OSv. The instruction is used by Go programs on Linux/x86-64 for system calls.
Signed-off-by: Pekka Enberg <[email protected]> Signed-off-by: BenoƮt Canet <[email protected]> --- arch/x64/arch-setup.cc | 25 ++++++++++++++ arch/x64/entry.S | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++ arch/x64/msr.hh | 3 ++ linux.cc | 13 ++++++++ 4 files changed, 131 insertions(+) diff --git a/arch/x64/arch-setup.cc b/arch/x64/arch-setup.cc index 5e76d82..0994de5 100644 --- a/arch/x64/arch-setup.cc +++ b/arch/x64/arch-setup.cc @@ -6,10 +6,12 @@ */ #include "arch.hh" +#include "arch-cpu.hh" #include "arch-setup.hh" #include <osv/mempool.hh> #include <osv/mmu.hh> #include "processor.hh" +#include "processor-flags.h" #include "msr.hh" #include "xen.hh" #include <osv/elf.hh> @@ -213,6 +215,28 @@ static inline void disable_pic() XENPV_ALTERNATIVE({ processor::outb(0xff, 0x21); processor::outb(0xff, 0xa1); }, {}); } +extern "C" void syscall_entry(void); + +// IA32_EFER.SCE (bit 0): SYSCALL Enable +static const int IA32_EFER_SCE = 0x1 << 0; +// Shift applied to gdt_cs to form the selector value written to IA32_STAR +static const int CS_SELECTOR_SHIFT = 3; +// Bit position within IA32_STAR where that selector value is placed +static const int IA_32_STAR_SYSCALL_SHIFT = 32; + +static void setup_syscall() +{ + unsigned long cs = gdt_cs; + processor::wrmsr(msr::IA32_STAR, (cs << CS_SELECTOR_SHIFT) << IA_32_STAR_SYSCALL_SHIFT); + // LSTAR is where syscall takes the new rip from, so point it at syscall_entry + processor::wrmsr(msr::IA32_LSTAR, reinterpret_cast<uint64_t>(syscall_entry)); + // syscall does rflags = rflags & ~fmask + // we want to minimize the impact of the syscall instruction so we choose + // fmask as zero + processor::wrmsr(msr::IA32_FMASK, 0); + processor::wrmsr(msr::IA32_EFER, processor::rdmsr(msr::IA32_EFER) | IA32_EFER_SCE); +} + void arch_init_premain() { auto omb = *osv_multiboot_info; @@ -220,6 +244,7 @@ void arch_init_premain() debug_early_u64("Error reading disk (real mode): ", static_cast<u64>(omb.disk_err)); disable_pic(); + setup_syscall(); } #include "drivers/driver.hh" diff --git a/arch/x64/entry.S b/arch/x64/entry.S index b6f5abe..b894f77 100644 --- a/arch/x64/entry.S +++ 
b/arch/x64/entry.S @@ -159,3 +159,93 @@ call_signal_handler_thunk: iretq .cfi_endproc +.align 16 +.global syscall_entry +syscall_entry: + .type syscall_entry, @function + .cfi_startproc simple + .cfi_signal_frame + .cfi_def_cfa %rsp, 0 + .cfi_register rip,rcx + # stack contains a signal_frame + .cfi_offset %r15, 0x00 + .cfi_offset %r14, 0x08 + .cfi_offset %r13, 0x10 + .cfi_offset %r12, 0x18 + .cfi_offset %r11, 0x20 + .cfi_offset %r10, 0x28 + .cfi_offset %r9, 0x30 + .cfi_offset %r8, 0x38 + .cfi_offset %rbp, 0x40 + .cfi_offset %rdi, 0x48 + .cfi_offset %rsi, 0x50 + .cfi_offset %rdx, 0x58 + .cfi_offset %rcx, 0x60 + .cfi_offset %rbx, 0x68 + .cfi_offset %rax, 0x70 + .cfi_offset %rip, 0x80 + .cfi_offset %rsp, 0x98 + + # There is no ring transition and rflags are left unchanged. + + # + # From http://stackoverflow.com/questions/2535989/what-are-the-calling-conventions-for-unix-linux-system-calls-on-x86-64: + # "User-level applications use as integer registers for passing the sequence %rdi, %rsi, %rdx, %rcx, %r8 and %r9. 
The kernel interface uses %rdi, %rsi, %rdx, %r10, %r8 and %r9" + + pushq_cfi %rsp + pushq_cfi %rcx # rip backup (syscall left the return rip in rcx) + pushq_cfi %rax + pushq_cfi %rbx + pushq_cfi %rcx + pushq_cfi %rdx + pushq_cfi %rsi + pushq_cfi %rdi + pushq_cfi %r8 + pushq_cfi %r9 + pushq_cfi %r10 + pushq_cfi %r11 + pushq_cfi %r12 + pushq_cfi %r13 + pushq_cfi %r14 + pushq_cfi %r15 + + # The kernel interface uses r10 as fourth argument while the user interface uses rcx + # so overwrite rcx with r10 + movq %r10, %rcx + + # prepare function call parameters (r9 on the stack since it's the seventh param) + # we shift existing params by one to make room for the syscall number + pushq %r9 + movq %r8, %r9 + movq %rcx, %r8 + movq %rdx, %rcx + movq %rsi, %rdx + movq %rdi, %rsi + # syscall number from rax as first argument + movq %rax, %rdi + + callq syscall_wrapper + + # In Linux both the user and the kernel return value are in rax, so there is nothing to do for return values. NOTE(review): the extra %r9 pushed above as the stack argument is not removed before the pops below, so every restore appears shifted by one slot -- confirm stack balance + + popq_cfi %r15 + popq_cfi %r14 + popq_cfi %r13 + popq_cfi %r12 + popq_cfi %r11 + popq_cfi %r10 + popq_cfi %r9 + popq_cfi %r8 + popq_cfi %rdi + popq_cfi %rsi + popq_cfi %rdx + popq_cfi %rcx + popq_cfi %rbx + add $8, %rsp # skip the saved rax slot (rax now holds the return value) + add $8, %rsp # skip the rip backup slot + popq_cfi %rsp + + # jump to rcx where the syscall instruction put rip + # (sysret would leave rcx clobbered so we have nothing to do to restore it) + jmpq *%rcx + .cfi_endproc diff --git a/arch/x64/msr.hh b/arch/x64/msr.hh index 154bba7..d77c75c 100644 --- a/arch/x64/msr.hh +++ b/arch/x64/msr.hh @@ -58,6 +58,9 @@ enum class msr : uint32_t { IA32_APIC_BASE = 0x0000001b, IA32_EFER = 0xc0000080, + IA32_STAR = 0xc0000081, + IA32_LSTAR = 0xc0000082, + IA32_FMASK = 0xc0000084, IA32_FS_BASE = 0xc0000100, KVM_WALL_CLOCK = 0x11, diff --git a/linux.cc b/linux.cc index bd82ca9..424b443 100644 --- a/linux.cc +++ b/linux.cc @@ -291,3 +291,16 @@ long syscall(long number, ...) return -1; } long __syscall(long number, ...) 
__attribute__((alias("syscall"))); + +extern "C" long syscall_wrapper(long number, ...) +{ + int errno_backup = errno; + // syscall and function return value are both in rax. NOTE(review): only `number` is forwarded to syscall() below (the variadic arguments are dropped) and the success path returns 0 instead of ret -- confirm intended + auto ret = syscall(number); + int result = -errno; + errno = errno_backup; + if (ret < 0) { + return result; + } + return 0; +} -- 2.7.4 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to [email protected]. For more options, visit https://groups.google.com/d/optout.
