Enable "fast system calls" via the 'syscall' instruction on OSv. The
instruction is used by Go programs on Linux/x86-64 for system calls.

Signed-off-by: Pekka Enberg <[email protected]>
Signed-off-by: BenoĆ®t Canet <[email protected]>
---
 arch/x64/arch-setup.cc | 25 ++++++++++++++
 arch/x64/entry.S       | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++
 arch/x64/msr.hh        |  3 ++
 linux.cc               | 13 ++++++++
 4 files changed, 131 insertions(+)

diff --git a/arch/x64/arch-setup.cc b/arch/x64/arch-setup.cc
index 5e76d82..0994de5 100644
--- a/arch/x64/arch-setup.cc
+++ b/arch/x64/arch-setup.cc
@@ -6,10 +6,12 @@
  */
 
 #include "arch.hh"
+#include "arch-cpu.hh"
 #include "arch-setup.hh"
 #include <osv/mempool.hh>
 #include <osv/mmu.hh>
 #include "processor.hh"
+#include "processor-flags.h"
 #include "msr.hh"
 #include "xen.hh"
 #include <osv/elf.hh>
@@ -213,6 +215,28 @@ static inline void disable_pic()
     XENPV_ALTERNATIVE({ processor::outb(0xff, 0x21); processor::outb(0xff, 
0xa1); }, {});
 }
 
+extern "C" void syscall_entry(void);
+
+// SYSCALL Enable
+static const int IA32_EFER_SCE = 0x1 << 0;
+// Selector shift
+static const int CS_SELECTOR_SHIFT = 3;
+// syscall shift
+static const int IA_32_STAR_SYSCALL_SHIFT = 32;
+
+static void setup_syscall()
+{
+    unsigned long cs = gdt_cs;
+    processor::wrmsr(msr::IA32_STAR,  (cs << CS_SELECTOR_SHIFT) << 
IA_32_STAR_SYSCALL_SHIFT);
+    // lstar is where syscall set rip so we set it to syscall_entry
+    processor::wrmsr(msr::IA32_LSTAR, 
reinterpret_cast<uint64_t>(syscall_entry));
+    // syscall does rflag = rflag and not fmask
+    // we want no minimize the impact of the syscall instruction so we choose
+    // fmask as zero
+    processor::wrmsr(msr::IA32_FMASK, 0);
+    processor::wrmsr(msr::IA32_EFER,  processor::rdmsr(msr::IA32_EFER) | 
IA32_EFER_SCE);
+}
+
 void arch_init_premain()
 {
     auto omb = *osv_multiboot_info;
@@ -220,6 +244,7 @@ void arch_init_premain()
        debug_early_u64("Error reading disk (real mode): ", 
static_cast<u64>(omb.disk_err));
 
     disable_pic();
+    setup_syscall();
 }
 
 #include "drivers/driver.hh"
diff --git a/arch/x64/entry.S b/arch/x64/entry.S
index b6f5abe..b894f77 100644
--- a/arch/x64/entry.S
+++ b/arch/x64/entry.S
@@ -159,3 +159,93 @@ call_signal_handler_thunk:
         iretq
         .cfi_endproc
 
+.align 16
+.global syscall_entry
+syscall_entry:
+        .type syscall_entry, @function
+        .cfi_startproc simple
+        .cfi_signal_frame
+        .cfi_def_cfa %rsp, 0
+       .cfi_register rip,rcx
+        # stack contains a signal_frame
+        .cfi_offset %r15, 0x00
+        .cfi_offset %r14, 0x08
+        .cfi_offset %r13, 0x10
+        .cfi_offset %r12, 0x18
+        .cfi_offset %r11, 0x20
+        .cfi_offset %r10, 0x28
+        .cfi_offset %r9, 0x30
+        .cfi_offset %r8, 0x38
+        .cfi_offset %rbp, 0x40
+        .cfi_offset %rdi, 0x48
+        .cfi_offset %rsi, 0x50
+        .cfi_offset %rdx, 0x58
+        .cfi_offset %rcx, 0x60
+        .cfi_offset %rbx, 0x68
+        .cfi_offset %rax, 0x70
+        .cfi_offset %rip, 0x80
+        .cfi_offset %rsp, 0x98
+
+       # There is no ring transition and rflags are left unchanged.
+
+       #
+       # From 
http://stackoverflow.com/questions/2535989/what-are-the-calling-conventions-for-unix-linux-system-calls-on-x86-64:
+       # "User-level applications use as integer registers for passing the 
sequence %rdi, %rsi, %rdx, %rcx, %r8 and %r9. The kernel interface uses %rdi, 
%rsi, %rdx, %r10, %r8 and %r9"
+
+       pushq_cfi %rsp
+       pushq_cfi %rcx # rip backup
+       pushq_cfi %rax
+       pushq_cfi %rbx
+       pushq_cfi %rcx
+       pushq_cfi %rdx
+       pushq_cfi %rsi
+       pushq_cfi %rdi
+       pushq_cfi %r8
+       pushq_cfi %r9
+       pushq_cfi %r10
+       pushq_cfi %r11
+       pushq_cfi %r12
+       pushq_cfi %r13
+       pushq_cfi %r14
+       pushq_cfi %r15
+
+       # The kernel interface use r10 as fourth argument while the user 
interface use rcx
+       # so overwrite rcx with r10
+       movq %r10, %rcx
+
+       # prepare function call parameter (r9 on the stack since it's the 
seventh param)
+       # we shift existing params by one to make room for syscall number
+       pushq %r9
+       movq %r8, %r9
+       movq %rcx, %r8
+       movq %rdx, %rcx
+       movq %rsi, %rdx
+       movq %rdi, %rsi
+       # syscall number from rax as first argument
+       movq %rax, %rdi
+
+       callq syscall_wrapper
+
+       # in Linux user and kernel return value are in rax so we have nothing 
to do for return values
+
+       popq_cfi %r15
+       popq_cfi %r14
+       popq_cfi %r13
+       popq_cfi %r12
+       popq_cfi %r11
+       popq_cfi %r10
+       popq_cfi %r9
+       popq_cfi %r8
+       popq_cfi %rdi
+       popq_cfi %rsi
+       popq_cfi %rdx
+       popq_cfi %rcx
+       popq_cfi %rbx
+        add $8, %rsp  # rax emplacement
+        add $8, %rsp  # rip emplacement
+       popq_cfi %rsp
+
+       # jump to rcx where the syscall instruction put rip
+       # (sysret would leave rxc cloberred so we have nothing to do to restore 
it)
+       jmpq *%rcx
+       .cfi_endproc
diff --git a/arch/x64/msr.hh b/arch/x64/msr.hh
index 154bba7..d77c75c 100644
--- a/arch/x64/msr.hh
+++ b/arch/x64/msr.hh
@@ -58,6 +58,9 @@ enum class msr : uint32_t {
 
     IA32_APIC_BASE = 0x0000001b,
     IA32_EFER = 0xc0000080,
+    IA32_STAR = 0xc0000081,
+    IA32_LSTAR = 0xc0000082,
+    IA32_FMASK = 0xc0000084,
     IA32_FS_BASE = 0xc0000100,
 
     KVM_WALL_CLOCK = 0x11,
diff --git a/linux.cc b/linux.cc
index bd82ca9..424b443 100644
--- a/linux.cc
+++ b/linux.cc
@@ -291,3 +291,16 @@ long syscall(long number, ...)
     return -1;
 }
 long __syscall(long number, ...)  __attribute__((alias("syscall")));
+
+extern "C" long syscall_wrapper(long number, ...)
+{
+    int errno_backup = errno;
+    // syscall and function return value are in rax
+    auto ret = syscall(number);
+    int result = -errno;
+    errno = errno_backup;
+    if (ret < 0) {
+       return result;
+    }
+    return 0;
+}
-- 
2.7.4

-- 
You received this message because you are subscribed to the Google Groups "OSv 
Development" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/d/optout.

Reply via email to