Looks good. Just some nitpicks about the stack stuff and debugging stuff
below.

On Sun, Aug 28, 2016 at 4:01 PM, Benoit Canet <
[email protected]> wrote:

> Enable "fast system calls" via the 'syscall' instruction on OSv. The
> instruction is used by Go programs on Linux/x86-64 for system calls.
>
> Signed-off-by: Pekka Enberg <[email protected]>
> Signed-off-by: Benoît Canet <[email protected]>
> ---
>  arch/x64/arch-setup.cc | 25 +++++++++++++
>  arch/x64/entry.S       | 96 ++++++++++++++++++++++++++++++
> ++++++++++++++++++++
>  arch/x64/msr.hh        |  3 ++
>  linux.cc               | 13 +++++++
>  4 files changed, 137 insertions(+)
>
> diff --git a/arch/x64/arch-setup.cc b/arch/x64/arch-setup.cc
> index 5e76d82..0994de5 100644
> --- a/arch/x64/arch-setup.cc
> +++ b/arch/x64/arch-setup.cc
> @@ -6,10 +6,12 @@
>   */
>
>  #include "arch.hh"
> +#include "arch-cpu.hh"
>  #include "arch-setup.hh"
>  #include <osv/mempool.hh>
>  #include <osv/mmu.hh>
>  #include "processor.hh"
> +#include "processor-flags.h"
>  #include "msr.hh"
>  #include "xen.hh"
>  #include <osv/elf.hh>
> @@ -213,6 +215,28 @@ static inline void disable_pic()
>      XENPV_ALTERNATIVE({ processor::outb(0xff, 0x21);
> processor::outb(0xff, 0xa1); }, {});
>  }
>
> +extern "C" void syscall_entry(void);
> +
> +// SYSCALL Enable
> +static const int IA32_EFER_SCE = 0x1 << 0;
> +// Selector shift
> +static const int CS_SELECTOR_SHIFT = 3;
> +// syscall shift
> +static const int IA_32_STAR_SYSCALL_SHIFT = 32;
> +
> +static void setup_syscall()
> +{
> +    unsigned long cs = gdt_cs;
> +    processor::wrmsr(msr::IA32_STAR,  (cs << CS_SELECTOR_SHIFT) <<
> IA_32_STAR_SYSCALL_SHIFT);
> +    // lstar is where syscall set rip so we set it to syscall_entry
> +    processor::wrmsr(msr::IA32_LSTAR, reinterpret_cast<uint64_t>(
> syscall_entry));
> +    // syscall does rflag = rflag and not fmask
> +    // we want no minimize the impact of the syscall instruction so we
> choose
> +    // fmask as zero
> +    processor::wrmsr(msr::IA32_FMASK, 0);
> +    processor::wrmsr(msr::IA32_EFER,  processor::rdmsr(msr::IA32_EFER) |
> IA32_EFER_SCE);
> +}
> +
>  void arch_init_premain()
>  {
>      auto omb = *osv_multiboot_info;
> @@ -220,6 +244,7 @@ void arch_init_premain()
>         debug_early_u64("Error reading disk (real mode): ",
> static_cast<u64>(omb.disk_err));
>
>      disable_pic();
> +    setup_syscall();
>  }
>
>  #include "drivers/driver.hh"
> diff --git a/arch/x64/entry.S b/arch/x64/entry.S
> index b6f5abe..0526aa7 100644
> --- a/arch/x64/entry.S
> +++ b/arch/x64/entry.S
> @@ -159,3 +159,99 @@ call_signal_handler_thunk:
>          iretq
>          .cfi_endproc
>
> +.align 16
> +.global syscall_entry
> +syscall_entry:
> +        .type syscall_entry, @function
> +        .cfi_startproc simple
> +       # There is no ring transition and rflags are left unchanged.
> +
> +       #
> +       # From http://stackoverflow.com/questions/2535989/what-are-
> the-calling-conventions-for-unix-linux-system-calls-on-x86-64:
> +       # "User-level applications use as integer registers for passing
> the sequence %rdi, %rsi, %rdx, %rcx, %r8 and %r9. The kernel interface uses
> %rdi, %rsi, %rdx, %r10, %r8 and %r9"
> +
> +        # FIXME: fpu
> +       # build the stack frame by hand
>

What do you mean by "build the stack frame"? The code below doesn't prepare
parameters on the stack, because those are all passed in registers.
I guess by "stack frame" you mean saving the old value of these registers?

+       pushq %rsp
> +       subq $8, %rsp # rip was saved in rcx by the syscall instruction
>

Why leave a hole in the stack? What purpose does it serve? Here you need
tgo subq, later you addq, and it's all just redundant, isn't it?

And why save %rsp at all? After we do all the pops, we'll get back the
original %rsp anyway...


> +       pushq %rax
> +       pushq %rbx
> +       pushq %rcx # contains rip before syscall instruction
> +       pushq %rdx
> +       pushq %rsi
> +       pushq %rdi
> +       pushq %r8
> +       pushq %r9
> +       pushq %r10
> +       pushq %r11 # contains rflags before syscall instruction
>

Oh, I didn't know that. Good you found that.


> +       pushq %r12
> +       pushq %r13
> +       pushq %r14
> +       pushq %r15
> +
> +        # stack contains a signal_frame
> +        .cfi_signal_frame
>

Why signal_frame?

Can you please take a look at __elf_resolve_pltgot from elf-dl.S? It might
make sense to do something similar here. Maybe you can use the
pushq_cfi/popq_cfi tricks instead of using .cfi_offset directly?

But anyway, if all this stuff works for you in gdb, I guess you got at
least most of it right :-)

+        .cfi_def_cfa %rsp, 0
> +       .cfi_register rip,rcx # rcx took previous rip value
> +       .cfi_register rflags,r11 # r11 took previous rflags value
> +       .cfi_undefined rcx # was overwritten with rip by the syscall
> instruction
> +       .cfi_undefined r11 # was overwritten with rflags by the syscall
> instruction
> +        .cfi_offset %r15, 0x00
> +        .cfi_offset %r14, 0x08
> +        .cfi_offset %r13, 0x10
> +        .cfi_offset %r12, 0x18
> +        .cfi_offset %r11, 0x20
> +        .cfi_offset %r10, 0x28
> +        .cfi_offset %r9, 0x30
> +        .cfi_offset %r8, 0x38
> +        .cfi_offset %rbp, 0x40
> +        .cfi_offset %rdi, 0x48
> +        .cfi_offset %rsi, 0x50
> +        .cfi_offset %rdx, 0x58
> +        .cfi_offset %rcx, 0x60
> +        .cfi_offset %rbx, 0x68
> +        .cfi_offset %rax, 0x70
> +        .cfi_offset %rip, 0x80
> +        .cfi_offset %rsp, 0x98
> +
> +       # The kernel interface use r10 as fourth argument while the user
> interface use rcx
> +       # so overwrite rcx with r10
> +       movq %r10, %rcx
> +
> +       # prepare function call parameter: r9 is on the stack since it's
> the seventh param
> +       # because we shift existing params by one to make room for syscall
> number
> +       pushq %r9
> +       movq %r8, %r9
> +       movq %rcx, %r8
> +       movq %rdx, %rcx
> +       movq %rsi, %rdx
> +       movq %rdi, %rsi
> +       # syscall number from rax as first argument
> +       movq %rax, %rdi
> +
> +       callq syscall_wrapper
> +
> +       popq %r9
> +       # in Linux user and kernel return value are in rax so we have
> nothing to do for return values
> +
> +       popq %r15
> +       popq %r14
> +       popq %r13
> +       popq %r12
> +       popq %r11
> +       popq %r10
> +       popq %r9
> +       popq %r8
> +       popq %rdi
> +       popq %rsi
> +       popq %rdx
> +       popq %rcx
> +       popq %rbx
> +        addq $8, %rsp  # skip rax emplacement (return value is in rax)
> +        addq $8, %rsp  # rip emplacement (rip cannot be popped)
> +       popq %rsp
>

Again, I don't think popq %rsp is needed, as the previous pops got rsp pack
to the right place.


> +
> +       # jump to rcx where the syscall instruction put rip
> +       # (sysret would leave rxc cloberred so we have nothing to do to
> restore it)
> +       jmpq *%rcx
> +       .cfi_endproc
> diff --git a/arch/x64/msr.hh b/arch/x64/msr.hh
> index 154bba7..d77c75c 100644
> --- a/arch/x64/msr.hh
> +++ b/arch/x64/msr.hh
> @@ -58,6 +58,9 @@ enum class msr : uint32_t {
>
>      IA32_APIC_BASE = 0x0000001b,
>      IA32_EFER = 0xc0000080,
> +    IA32_STAR = 0xc0000081,
> +    IA32_LSTAR = 0xc0000082,
> +    IA32_FMASK = 0xc0000084,
>      IA32_FS_BASE = 0xc0000100,
>
>      KVM_WALL_CLOCK = 0x11,
> diff --git a/linux.cc b/linux.cc
> index bd82ca9..843d131 100644
> --- a/linux.cc
> +++ b/linux.cc
> @@ -291,3 +291,16 @@ long syscall(long number, ...)
>      return -1;
>  }
>  long __syscall(long number, ...)  __attribute__((alias("syscall")));
> +
> +extern "C" long syscall_wrapper(long number, ...)
> +{
> +    int errno_backup = errno;
> +    // syscall and function return value are in rax
> +    auto ret = syscall(number);
> +    int result = -errno;
> +    errno = errno_backup;
> +    if (ret < 0 && ret >= -4096) {
> +       return result;
> +    }
> +    return ret;
> +}
> --
> 2.7.4
>
> --
> You received this message because you are subscribed to the Google Groups
> "OSv Development" group.
> To unsubscribe from this group and stop receiving emails from it, send an
> email to [email protected].
> For more options, visit https://groups.google.com/d/optout.
>

-- 
You received this message because you are subscribed to the Google Groups "OSv 
Development" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/d/optout.

Reply via email to