Looks good. Just some nitpicks about the stack stuff and debugging stuff below.
On Sun, Aug 28, 2016 at 4:01 PM, Benoit Canet < [email protected]> wrote: > Enable "fast system calls" via the 'syscall' instruction on OSv. The > instruction is used by Go programs on Linux/x86-64 for system calls. > > Signed-off-by: Pekka Enberg <[email protected]> > Signed-off-by: Benoît Canet <[email protected]> > --- > arch/x64/arch-setup.cc | 25 +++++++++++++ > arch/x64/entry.S | 96 ++++++++++++++++++++++++++++++ > ++++++++++++++++++++ > arch/x64/msr.hh | 3 ++ > linux.cc | 13 +++++++ > 4 files changed, 137 insertions(+) > > diff --git a/arch/x64/arch-setup.cc b/arch/x64/arch-setup.cc > index 5e76d82..0994de5 100644 > --- a/arch/x64/arch-setup.cc > +++ b/arch/x64/arch-setup.cc > @@ -6,10 +6,12 @@ > */ > > #include "arch.hh" > +#include "arch-cpu.hh" > #include "arch-setup.hh" > #include <osv/mempool.hh> > #include <osv/mmu.hh> > #include "processor.hh" > +#include "processor-flags.h" > #include "msr.hh" > #include "xen.hh" > #include <osv/elf.hh> > @@ -213,6 +215,28 @@ static inline void disable_pic() > XENPV_ALTERNATIVE({ processor::outb(0xff, 0x21); > processor::outb(0xff, 0xa1); }, {}); > } > > +extern "C" void syscall_entry(void); > + > +// SYSCALL Enable > +static const int IA32_EFER_SCE = 0x1 << 0; > +// Selector shift > +static const int CS_SELECTOR_SHIFT = 3; > +// syscall shift > +static const int IA_32_STAR_SYSCALL_SHIFT = 32; > + > +static void setup_syscall() > +{ > + unsigned long cs = gdt_cs; > + processor::wrmsr(msr::IA32_STAR, (cs << CS_SELECTOR_SHIFT) << > IA_32_STAR_SYSCALL_SHIFT); > + // lstar is where syscall set rip so we set it to syscall_entry > + processor::wrmsr(msr::IA32_LSTAR, reinterpret_cast<uint64_t>( > syscall_entry)); > + // syscall does rflag = rflag and not fmask > + // we want no minimize the impact of the syscall instruction so we > choose > + // fmask as zero > + processor::wrmsr(msr::IA32_FMASK, 0); > + processor::wrmsr(msr::IA32_EFER, processor::rdmsr(msr::IA32_EFER) | > IA32_EFER_SCE); > +} > + > void arch_init_premain() > { > auto omb = *osv_multiboot_info; > @@ -220,6 +244,7 @@ void arch_init_premain() > debug_early_u64("Error reading disk (real mode): ", > static_cast<u64>(omb.disk_err)); > > disable_pic(); > + setup_syscall(); > } > > #include "drivers/driver.hh" > diff --git a/arch/x64/entry.S b/arch/x64/entry.S > index b6f5abe..0526aa7 100644 > --- a/arch/x64/entry.S > +++ b/arch/x64/entry.S > @@ -159,3 +159,99 @@ call_signal_handler_thunk: > iretq > .cfi_endproc > > +.align 16 > +.global syscall_entry > +syscall_entry: > + .type syscall_entry, @function > + .cfi_startproc simple > + # There is no ring transition and rflags are left unchanged. > + > + # > + # From http://stackoverflow.com/questions/2535989/what-are- > the-calling-conventions-for-unix-linux-system-calls-on-x86-64: > + # "User-level applications use as integer registers for passing > the sequence %rdi, %rsi, %rdx, %rcx, %r8 and %r9. The kernel interface uses > %rdi, %rsi, %rdx, %r10, %r8 and %r9" > + > + # FIXME: fpu > + # build the stack frame by hand > What do you mean by "build the stack frame"? The code below doesn't prepare parameters on the stack, because those are all passed in registers. I guess by "stack frame" you mean saving the old value of these registers? + pushq %rsp > + subq $8, %rsp # rip was saved in rcx by the syscall instruction > Why leave a hole in the stack? What purpose does it serve? Here you need tgo subq, later you addq, and it's all just redundant, isn't it? And why save %rsp at all? After we do all the pops, we'll get back the original %rsp anyway... > + pushq %rax > + pushq %rbx > + pushq %rcx # contains rip before syscall instruction > + pushq %rdx > + pushq %rsi > + pushq %rdi > + pushq %r8 > + pushq %r9 > + pushq %r10 > + pushq %r11 # contains rflags before syscall instruction > Oh, I didn't know that. Good you found that. > + pushq %r12 > + pushq %r13 > + pushq %r14 > + pushq %r15 > + > + # stack contains a signal_frame > + .cfi_signal_frame > Why signal_frame? Can you please take a look at __elf_resolve_pltgot from elf-dl.S? It might make sense to do something similar here. Maybe you can use the pushq_cfi/popq_cfi tricks instead of using .cfi_offset directly? But anyway, if all this stuff works for you in gdb, I guess you got at least most of it right :-) + .cfi_def_cfa %rsp, 0 > + .cfi_register rip,rcx # rcx took previous rip value > + .cfi_register rflags,r11 # r11 took previous rflags value > + .cfi_undefined rcx # was overwritten with rip by the syscall > instruction > + .cfi_undefined r11 # was overwritten with rflags by the syscall > instruction > + .cfi_offset %r15, 0x00 > + .cfi_offset %r14, 0x08 > + .cfi_offset %r13, 0x10 > + .cfi_offset %r12, 0x18 > + .cfi_offset %r11, 0x20 > + .cfi_offset %r10, 0x28 > + .cfi_offset %r9, 0x30 > + .cfi_offset %r8, 0x38 > + .cfi_offset %rbp, 0x40 > + .cfi_offset %rdi, 0x48 > + .cfi_offset %rsi, 0x50 > + .cfi_offset %rdx, 0x58 > + .cfi_offset %rcx, 0x60 > + .cfi_offset %rbx, 0x68 > + .cfi_offset %rax, 0x70 > + .cfi_offset %rip, 0x80 > + .cfi_offset %rsp, 0x98 > + > + # The kernel interface use r10 as fourth argument while the user > interface use rcx > + # so overwrite rcx with r10 > + movq %r10, %rcx > + > + # prepare function call parameter: r9 is on the stack since it's > the seventh param > + # because we shift existing params by one to make room for syscall > number > + pushq %r9 > + movq %r8, %r9 > + movq %rcx, %r8 > + movq %rdx, %rcx > + movq %rsi, %rdx > + movq %rdi, %rsi > + # syscall number from rax as first argument > + movq %rax, %rdi > + > + callq syscall_wrapper > + > + popq %r9 > + # in Linux user and kernel return value are in rax so we have > nothing to do for return values > + > + popq %r15 > + popq %r14 > + popq %r13 > + popq %r12 > + popq %r11 > + popq %r10 > + popq %r9 > + popq %r8 > + popq %rdi > + popq %rsi > + popq %rdx > + popq %rcx > + popq %rbx > + addq $8, %rsp # skip rax emplacement (return value is in rax) > + addq $8, %rsp # rip emplacement (rip cannot be popped) > + popq %rsp > Again, I don't think popq %rsp is needed, as the previous pops got rsp pack to the right place. > + > + # jump to rcx where the syscall instruction put rip > + # (sysret would leave rxc cloberred so we have nothing to do to > restore it) > + jmpq *%rcx > + .cfi_endproc > diff --git a/arch/x64/msr.hh b/arch/x64/msr.hh > index 154bba7..d77c75c 100644 > --- a/arch/x64/msr.hh > +++ b/arch/x64/msr.hh > @@ -58,6 +58,9 @@ enum class msr : uint32_t { > > IA32_APIC_BASE = 0x0000001b, > IA32_EFER = 0xc0000080, > + IA32_STAR = 0xc0000081, > + IA32_LSTAR = 0xc0000082, > + IA32_FMASK = 0xc0000084, > IA32_FS_BASE = 0xc0000100, > > KVM_WALL_CLOCK = 0x11, > diff --git a/linux.cc b/linux.cc > index bd82ca9..843d131 100644 > --- a/linux.cc > +++ b/linux.cc > @@ -291,3 +291,16 @@ long syscall(long number, ...) > return -1; > } > long __syscall(long number, ...) __attribute__((alias("syscall"))); > + > +extern "C" long syscall_wrapper(long number, ...) > +{ > + int errno_backup = errno; > + // syscall and function return value are in rax > + auto ret = syscall(number); > + int result = -errno; > + errno = errno_backup; > + if (ret < 0 && ret >= -4096) { > + return result; > + } > + return ret; > +} > -- > 2.7.4 > > -- > You received this message because you are subscribed to the Google Groups > "OSv Development" group. > To unsubscribe from this group and stop receiving emails from it, send an > email to [email protected]. > For more options, visit https://groups.google.com/d/optout. > -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to [email protected]. For more options, visit https://groups.google.com/d/optout.
