This is the patch that makes the golang httpserver work.

On Thursday, March 15, 2018 at 10:56:26 PM UTC-4, Waldek Kozaczuk wrote:
>
> This patch implements a separate syscall stack, needed 
> when runtimes like Golang use the SYSCALL instruction to execute 
> system calls. More specifically, in the case of Golang the tiny stacks 
> used by goroutines are not deep enough to execute system 
> calls on OSv, which handles them as regular function calls. 
> On an OS like Linux the tiny Golang stacks are not a problem, as 
> the system calls are executed on a separate kernel stack. 
>
> In more detail, each application thread pre-allocates a 
> "tiny" (1024-byte) syscall stack. The first time the SYSCALL 
> instruction is executed on a given thread, execution is switched 
> to this tiny syscall stack, which is used only to set up the "large" 
> stack and switch to it. Execution of the syscall then continues 
> on the large syscall stack. From that point on, the second and all subsequent 
> executions of the SYSCALL instruction are handled directly on the large syscall stack. 
>
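For readers following along, here is a minimal user-space sketch of the lazy two-stage idea described above (this is not the patch itself; all names here are illustrative): every application thread starts with a pre-allocated tiny stack, and the first SYSCALL on that thread builds the large stack and retires the tiny one, so every later syscall takes the fast path.

// Sketch only: models the tiny->large switch in plain C++.
#include <cstdio>
#include <cstdlib>

enum stack_kind { TINY = 0, LARGE = 1 };

struct per_thread {
    void* syscall_stack;   // stands in for _tcb->syscall_stack_top
    stack_kind kind;       // stands in for the tiny/large indicator
};

static void handle_syscall(per_thread& t, int number)
{
    if (t.kind == TINY) {
        // First SYSCALL on this thread: the tiny stack is only deep enough
        // to run this setup step; afterwards we stay on the large stack.
        void* large = malloc(16 * 4096);
        free(t.syscall_stack);         // the tiny stack is no longer needed
        t.syscall_stack = large;
        t.kind = LARGE;
    }
    // Second and subsequent syscalls take this fast path directly.
    printf("syscall %d handled on the %s stack\n", number,
           t.kind == LARGE ? "large" : "tiny");
}

int main()
{
    per_thread t{malloc(1024), TINY};
    handle_syscall(t, 1);   // triggers the one-time setup
    handle_syscall(t, 2);   // already on the large stack
    free(t.syscall_stack);
}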
> Please note that because the stack is switched from the original 
> caller stack to the lazily allocated syscall stack, 
> the stack pointer may "jump" to a higher address. From gdb's 
> perspective this results in the "Backtrace stopped: previous 
> frame inner to this frame (corrupt stack?)" message. 
> Gdb assumes that the stack pointer should decrease in value as the stack 
> "grows", which is not the case when the stack is switched to the syscall stack. 
>
> This patch is based on the original patch provided by @Hawxchen. 
>
> Fixes #808 
>
> --- 
>  arch/x64/arch-switch.hh | 108 ++++++++++++++++++++++++++++++++++++++++++++++++ 
>  arch/x64/arch-tls.hh    |  15 +++++++ 
>  arch/x64/entry.S        | 102 ++++++++++++++++++++++++++++++++++----------- 
>  include/osv/sched.hh    |   4 +- 
>  4 files changed, 205 insertions(+), 24 deletions(-) 
>
> diff --git a/arch/x64/arch-switch.hh b/arch/x64/arch-switch.hh 
> index d1a039a..b7635d2 100644 
> --- a/arch/x64/arch-switch.hh 
> +++ b/arch/x64/arch-switch.hh 
> @@ -12,6 +12,38 @@ 
>  #include <osv/barrier.hh> 
>  #include <string.h> 
>   
> +// 
> +// The last 16 bytes of the syscall stack are reserved for 2 fields - 
> +// caller stack pointer and tiny/large indicator. 
> +// The reserved space has to be a multiple of 16 bytes to keep the stack 
> +// 16-byte aligned, as the Linux x64 ABI mandates. 
> +#define SYSCALL_STACK_RESERVED_SPACE_SIZE (2 * 8) 
> +// 
> +// The tiny stack has to be large enough to allow for execution of 
> +// thread::setup_large_syscall_stack() that allocates and sets up 
> +// large syscall stack. It was measured that as of this writing 
> +// setup_large_syscall_stack() needs a little over 600 bytes of stack 
> +// to operate properly. This makes 1024 bytes an adequate size 
> +// for the tiny stack. 
> +// All application threads pre-allocate the tiny syscall stack, so there 
> +// is a small per-thread cost with this solution. 
> +#define TINY_SYSCALL_STACK_SIZE 1024 
> +#define TINY_SYSCALL_STACK_DEPTH (TINY_SYSCALL_STACK_SIZE - SYSCALL_STACK_RESERVED_SPACE_SIZE) 
> +// 
> +// The large syscall stack is setup and switched to on first 
> +// execution of SYSCALL instruction for given application thread. 
> +#define LARGE_SYSCALL_STACK_SIZE (16 * PAGE_SIZE) 
> +#define LARGE_SYSCALL_STACK_DEPTH (LARGE_SYSCALL_STACK_SIZE - SYSCALL_STACK_RESERVED_SPACE_SIZE) 
> + 
> +#define SET_SYSCALL_STACK_TYPE_INDICATOR(value) \ 
> +*((long*)(_tcb->syscall_stack_top + 8)) = value; 
> + 
> +#define GET_SYSCALL_STACK_TYPE_INDICATOR() \ 
> +*((long*)(_tcb->syscall_stack_top + 8)) 
> + 
> +#define TINY_SYSCALL_STACK_INDICATOR 0l 
> +#define LARGE_SYSCALL_STACK_INDICATOR 1l 
> + 
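A quick stand-alone sketch of what these constants imply (not OSv code; the patch uses plain malloc, aligned_alloc here just makes the alignment explicit): the top 16 bytes of each allocation hold the caller-RSP slot and the tiny/large indicator, and because the reserved space is a multiple of 16, the usable depth stays 16-byte aligned as the ABI requires.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

constexpr size_t RESERVED   = 2 * 8;               // SYSCALL_STACK_RESERVED_SPACE_SIZE
constexpr size_t TINY_SIZE  = 1024;                // TINY_SYSCALL_STACK_SIZE
constexpr size_t TINY_DEPTH = TINY_SIZE - RESERVED;

int main()
{
    // Allocate a "tiny" stack roughly the way setup_tcb() does.
    char* begin = static_cast<char*>(aligned_alloc(16, TINY_SIZE));
    char* top   = begin + TINY_DEPTH;              // plays the role of _tcb->syscall_stack_top

    *reinterpret_cast<uintptr_t*>(top) = 0;        // caller RSP slot, filled on SYSCALL entry
    *reinterpret_cast<long*>(top + 8)  = 0;        // indicator: 0 == tiny, 1 == large

    // RESERVED is a multiple of 16, so the usable depth keeps the
    // 16-byte alignment of the allocation.
    assert(TINY_DEPTH % 16 == 0);
    printf("usable tiny stack depth: %zu bytes\n", TINY_DEPTH);
    free(begin);
}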
>  extern "C" { 
>  void thread_main(void); 
>  void thread_main_c(sched::thread* t); 
> @@ -146,6 +178,65 @@ void thread::setup_tcb() 
>      _tcb = static_cast<thread_control_block*>(p + tls.size + user_tls_size); 
>      _tcb->self = _tcb; 
>      _tcb->tls_base = p + user_tls_size; 
> + 
> +    if (is_app()) { 
> +        // 
> +        // Allocate TINY syscall call stack 
> +        void* tiny_syscall_stack_begin = malloc(TINY_SYSCALL_STACK_SIZE); 
> +        assert(tiny_syscall_stack_begin); 
> +        // 
> +        // The top of the stack needs to be 16 bytes lower to make space for 
> +        // two fields - SYSCALL caller stack pointer and OSv syscall stack type indicator 
> +        _tcb->syscall_stack_top = tiny_syscall_stack_begin + TINY_SYSCALL_STACK_DEPTH; 
> +        SET_SYSCALL_STACK_TYPE_INDICATOR(TINY_SYSCALL_STACK_INDICATOR); 
> +    } 
> +    else { 
> +        _tcb->syscall_stack_top = 0; 
> +    } 
> +} 
> + 
> +void thread::setup_large_syscall_stack() 
> +{ 
> +    assert(is_app()); 
> +    assert(GET_SYSCALL_STACK_TYPE_INDICATOR() == TINY_SYSCALL_STACK_INDICATOR); 
> +    // 
> +    // Allocate LARGE syscall stack 
> +    void* large_syscall_stack_begin = malloc(LARGE_SYSCALL_STACK_SIZE); 
> +    void* large_syscall_stack_top = large_syscall_stack_begin + LARGE_SYSCALL_STACK_DEPTH; 
> +    // 
> +    // Copy all of the tiny stack to the area of the last 1024 bytes of the large stack. 
> +    // This way we do not have to pop and push the same registers to be saved again. 
> +    // The caller stack pointer is also copied. 
> +    // We could have copied only the last 128 (registers) + 16 bytes (2 fields) instead of all of the stack, but copying 1024 bytes is simpler and happens 
> +    // only once per thread. 
> +    void* tiny_syscall_stack_top = _tcb->syscall_stack_top; 
> +    memcpy(large_syscall_stack_top - TINY_SYSCALL_STACK_DEPTH, tiny_syscall_stack_top - TINY_SYSCALL_STACK_DEPTH, TINY_SYSCALL_STACK_SIZE); 
> +    // 
> +    // Save beginning of tiny stack at the bottom of LARGE stack so 
> +    // that we can deallocate it in free_tiny_syscall_stack 
> +    *((void**)large_syscall_stack_begin) = tiny_syscall_stack_top - TINY_SYSCALL_STACK_DEPTH; 
> +    // 
> +    // Set canary value (0xDEADBEAFDEADBEAF) at bottom + 8 of the LARGE stack 
> +    *((long*)(large_syscall_stack_begin + 8)) = 0xdeadbeafdeadbeaf; 
> +    // 
> +    // Switch syscall stack address value in TCB to the top of the LARGE one 
> +    _tcb->syscall_stack_top = large_syscall_stack_top; 
> +    SET_SYSCALL_STACK_TYPE_INDICATOR(LARGE_SYSCALL_STACK_INDICATOR); 
> +} 
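To illustrate why copying the whole tiny stack works here, a small user-space sketch (not OSv code; the constants mirror the macros above): the used portion of the tiny stack lands at the same offsets below the top of the large stack, so whatever entry.S pushed before the call is found at the same distance from the new stack pointer and never needs to be re-pushed. Copying the reserved 16 bytes as well is what carries the saved caller RSP over to the large stack.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>

constexpr size_t RESERVED    = 16;
constexpr size_t TINY_SIZE   = 1024;
constexpr size_t TINY_DEPTH  = TINY_SIZE - RESERVED;
constexpr size_t LARGE_SIZE  = 16 * 4096;
constexpr size_t LARGE_DEPTH = LARGE_SIZE - RESERVED;

int main()
{
    char* tiny_begin  = static_cast<char*>(malloc(TINY_SIZE));
    char* tiny_top    = tiny_begin + TINY_DEPTH;
    char* large_begin = static_cast<char*>(malloc(LARGE_SIZE));
    char* large_top   = large_begin + LARGE_DEPTH;

    // Pretend entry.S pushed a register value 128 bytes below the tiny top.
    memset(tiny_begin, 0, TINY_SIZE);
    *reinterpret_cast<uint64_t*>(tiny_top - 128) = 0x1234;

    // The copy from setup_large_syscall_stack(): the whole tiny stack,
    // including the two reserved fields, moves under the large stack's top.
    memcpy(large_top - TINY_DEPTH, tiny_top - TINY_DEPTH, TINY_SIZE);

    // The pushed value reappears at the same offset from the new top.
    assert(*reinterpret_cast<uint64_t*>(large_top - 128) == 0x1234);

    free(tiny_begin);
    free(large_begin);
}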
> + 
> +void thread::free_tiny_syscall_stack() 
> +{ 
> +    assert(is_app()); 
> +    assert(GET_SYSCALL_STACK_TYPE_INDICATOR() == LARGE_SYSCALL_STACK_INDICATOR); 
> + 
> +    void* large_syscall_stack_top = _tcb->syscall_stack_top; 
> +    void* large_syscall_stack_begin = large_syscall_stack_top - LARGE_SYSCALL_STACK_DEPTH; 
> +    // 
> +    // Lookup address of tiny stack saved by setup_large_syscall_stack() 
> +    // at the bottom of LARGE stack (current syscall stack) 
> +    void* tiny_syscall_stack_begin = *((void**)large_syscall_stack_begin); 
> +    free(tiny_syscall_stack_begin); 
>  } 
>   
>  void thread::free_tcb() 
> @@ -156,6 +247,13 @@ void thread::free_tcb() 
>      } else { 
>          free(_tcb->tls_base); 
>      } 
> + 
> +    if (_tcb->syscall_stack_top) { 
> +        void* syscall_stack_begin = GET_SYSCALL_STACK_TYPE_INDICATOR() == TINY_SYSCALL_STACK_INDICATOR ? 
> +            _tcb->syscall_stack_top - TINY_SYSCALL_STACK_DEPTH : 
> +            _tcb->syscall_stack_top - LARGE_SYSCALL_STACK_DEPTH; 
> +        free(syscall_stack_begin); 
> +    } 
>  } 
>   
>  void thread_main_c(thread* t) 
> @@ -171,6 +269,16 @@ void thread_main_c(thread* t) 
>      t->complete(); 
>  } 
>   
> +extern "C" void setup_large_syscall_stack() 
> +{ 
> +    sched::thread::current()->setup_large_syscall_stack(); 
> +} 
> + 
> +extern "C" void free_tiny_syscall_stack() 
> +{ 
> +    sched::thread::current()->free_tiny_syscall_stack(); 
> +} 
> + 
>  } 
>   
>  #endif /* ARCH_SWITCH_HH_ */ 
> diff --git a/arch/x64/arch-tls.hh b/arch/x64/arch-tls.hh 
> index 1bf86fd..6dea9fc 100644 
> --- a/arch/x64/arch-tls.hh 
> +++ b/arch/x64/arch-tls.hh 
> @@ -8,9 +8,24 @@ 
>  #ifndef ARCH_TLS_HH 
>  #define ARCH_TLS_HH 
>   
> +// Don't change the declaration order of the existing members. 
> +// Please add new members at the end. 
>  struct thread_control_block { 
>      thread_control_block* self; 
>      void* tls_base; 
> +    // 
> +    // This field, a per-thread stack for the SYSCALL instruction, is used in 
> +    // arch/x64/entry.S through an offset from %fs. We currently keep this field in the TCB 
> +    // to make it easier to access in assembly code through a known offset at %fs:16. 
> +    // But with more effort, we could have used an ordinary thread-local variable 
> +    // instead and avoided extending the TCB here. 
> +    // 
> +    // The 8 bytes at the address (syscall_stack_top + 8) are used to identify if 
> +    // the stack is tiny (0) or large (1). The 8 bytes at syscall_stack_top are used to 
> +    // store the caller stack pointer (the value of RSP when SYSCALL was called). 
> +    // So the usable size of the syscall stack is in reality 16 bytes smaller than what 
> +    // was originally allocated. 
> +    void* syscall_stack_top; 
>  }; 
>   
>  #endif /* ARCH_TLS_HH */ 
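As a side note, the %fs:16 accesses in entry.S rely on syscall_stack_top being the third 8-byte field of the TCB. A stand-alone sketch of that assumption (not OSv code, just the struct as declared above):

#include <cstddef>

struct thread_control_block {
    thread_control_block* self;   // offset 0, reachable as %fs:0
    void* tls_base;               // offset 8
    void* syscall_stack_top;      // offset 16, reachable as %fs:16
};

static_assert(offsetof(thread_control_block, syscall_stack_top) == 16,
              "entry.S expects the syscall stack pointer at %fs:16");

int main() { return 0; }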
> diff --git a/arch/x64/entry.S b/arch/x64/entry.S 
> index 04d809d..05b0d56 100644 
> --- a/arch/x64/entry.S 
> +++ b/arch/x64/entry.S 
> @@ -170,26 +170,79 @@ syscall_entry: 
>      .cfi_register rflags, r11 # r11 took previous rflags value 
>      # There is no ring transition and rflags are left unchanged. 
>   
> -    # Skip the "red zone" allowed by the AMD64 ABI (the caller used a 
> -    # SYSCALL instruction and doesn't know he called a function): 
> -    subq $128, %rsp 
> - 
>      # Align the stack to 16 bytes. We align it now because of limitations of 
>      # the CFI language, but need to ensure it is still aligned before we call 
>      # syscall_wrapper(), so must ensure that the number of pushes below are 
>      # even. 
> -    # An additional complication is that we need to restore %rsp later without 
> -    # knowing how it was previously aligned. In the following trick, without 
> -    # using an additional register, the two pushes leave the stack with the 
> -    # same alignment it had originally, and a copy of the original %rsp at 
> -    # (%rsp) and 8(%rsp). The andq then aligns the stack - if it was already 
> -    # 16 byte aligned nothing changes, if it was 8 byte aligned then it 
> -    # subtracts 8 from %rsp, meaning that the original %rsp is now at 8(%rsp) 
> -    # and 16(%rsp). In both cases we can restore it below from 8(%rsp). 
> -    pushq %rsp 
> -    pushq (%rsp) 
> -    andq $-16, %rsp 
> - 
> +    # 
> +    # Switch the stack to the "tiny" syscall stack, which should be large 
> +    # enough to set up the "large" syscall stack (only on the first SYSCALL on this thread). 
> +    # 
> +    # Unfortunately movq %rsp, (%fs:16) is not a valid operation so we have to 
> +    # copy the syscall stack pointer - %fs:16 - to a scratch register %rbx 
> +    # so that eventually we can copy the caller stack pointer to the 8-byte field 
> +    # at the top of the syscall stack. That way we can restore the caller stack pointer 
> +    # later on exit. 
> +    # 
> +    # Please note we are saving the value of %rbx on the caller stack, which might 
> +    # not be necessary as %rbx is considered a scratch register. 
> +    pushq %rbx 
> +    movq %fs:16, %rbx 
> +    movq %rsp, (%rbx) // Save caller stack pointer 
> +    movq %rbx, %rsp   // Switch stack to tiny or large syscall stack 
> + 
> +    # Skip the large syscall stack setup if it has already been set up 
> +    cmpq $0, 8(%rbx)  // Check if we are on tiny or large stack 
> +    jne large_stack_has_been_setup 
> + 
> +    # We are on small stack 
> +    # Save all registers 
> +    pushq %rcx 
> +    pushq %rcx # we push rcx twice to make stack 16-bytes aligned 
> +    pushq %rbp 
> +    pushq %rbx 
> +    pushq %rax 
> +    pushq %rdx 
> +    pushq %rsi 
> +    pushq %rdi 
> +    pushq %r8 
> +    pushq %r9 
> +    pushq %r10 
> +    pushq %r11 # contains rflags before syscall instruction 
> +    pushq %r12 
> +    pushq %r13 
> +    pushq %r14 
> +    pushq %r15 
> + 
> +    # Call setup_large_syscall_stack to set up the large syscall stack. 
> +    # This function takes no arguments and returns nothing. 
> +    # It ends up allocating the large stack and storing its address in the TCB. 
> +    callq setup_large_syscall_stack 
> +    movq %fs:16, %rsp  // Switch stack to the large stack 
> +    subq $128, %rsp    // Skip 128 bytes of the large stack so that we can restore all registers saved above (16 pushes). 
> +                       // Please note that these 128 bytes have been copied by the setup_large_syscall_stack function 
> +                       // so that we do not have to pop and then push the same registers again. 
> +    callq free_tiny_syscall_stack 
> + 
> +    # Restore all registers 
> +    popq %r15 
> +    popq %r14 
> +    popq %r13 
> +    popq %r12 
> +    popq %r11 
> +    popq %r10 
> +    popq %r9 
> +    popq %r8 
> +    popq %rdi 
> +    popq %rsi 
> +    popq %rdx 
> +    popq %rax 
> +    popq %rbx 
> +    popq %rbp 
> +    popq %rcx 
> +    popq %rcx 
> + 
> +large_stack_has_been_setup: 
>      .cfi_def_cfa %rsp, 0 
>   
>      # We need to save and restore the caller's %rbp anyway, so let's also 
> @@ -204,9 +257,12 @@ syscall_entry: 
>      # We do this just so we can refer to it with CFI and help gdb's DWARF 
>      # stack unwinding. This saving not otherwise needed for correct operation 
>      # (we anyway restore it below by undoing all our modifications). 
> -    movq 24(%rsp), %rbp 
> -    addq $128, %rbp 
> -    pushq %rbp 
> +    # 
> +    # The caller's %rsp was saved at the top of the syscall stack (the address held in %fs:16). 
> +    # Since we have pushed twice after switching %rsp to the syscall stack, 
> +    # the saved caller %rsp is now at 16(%rsp), so we simply push it from there. 
> +    pushq 16(%rsp) 
> + 
>      .cfi_adjust_cfa_offset 8 
>      .cfi_rel_offset %rsp, 0 
>   
> @@ -275,16 +331,16 @@ syscall_entry: 
>      popq_cfi %rbp 
>      popq_cfi %rcx 
>   
> -    movq 8(%rsp), %rsp # undo alignment (as explained above) 
> - 
>      # restore rflags 
>      # push the rflag state syscall saved in r11 to the stack 
>      pushq %r11 
>      # pop the stack value in flag register 
>      popfq 
>   
> -    #undo red-zone skip without altering restored flags 
> -    lea 128(%rsp), %rsp 
> +    # Restore caller stack pointer 
> +    movq %fs:16, %rbx 
> +    movq (%rbx), %rsp 
> +    popq %rbx 
>   
>      # jump to rcx where the syscall instruction put rip 
>      # (sysret would leave rxc cloberred so we have nothing to do to restore it) 
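If it helps, here is a small user-space model of the entry/exit choreography around the stack switch (not the assembly itself; plain arrays stand in for the two stacks): %rbx is pushed on the caller stack, the caller %rsp is stashed in the slot at the syscall stack top, and on exit the order is reversed.

#include <cassert>
#include <cstdint>

int main()
{
    uint64_t caller_stack[16]  = {};
    uint64_t syscall_stack[16] = {};
    // Slot [14] plays the caller-RSP field, slot [15] the tiny/large indicator.
    uint64_t* syscall_top = &syscall_stack[14];

    // Entry: pushq %rbx on the caller stack, then stash the caller %rsp.
    uint64_t  rbx = 0xb;                                // caller's %rbx value
    uint64_t* rsp = &caller_stack[16];                  // caller's %rsp (empty stack)
    *--rsp = rbx;                                       // pushq %rbx
    syscall_top[0] = reinterpret_cast<uint64_t>(rsp);   // movq %rsp, (%rbx)
    rsp = syscall_top;                                  // movq %rbx, %rsp

    // ... the syscall is handled on syscall_stack ...

    // Exit: restore the caller %rsp from the slot, then popq %rbx.
    rsp = reinterpret_cast<uint64_t*>(syscall_top[0]);  // movq (%rbx), %rsp
    uint64_t restored_rbx = *rsp++;                     // popq %rbx
    assert(restored_rbx == 0xb);
    assert(rsp == &caller_stack[16]);                   // back to the original caller %rsp
    (void)restored_rbx;
}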
> diff --git a/include/osv/sched.hh b/include/osv/sched.hh 
> index dada8f5..033dfae 100644 
> --- a/include/osv/sched.hh 
> +++ b/include/osv/sched.hh 
> @@ -591,7 +591,9 @@ public: 
>        * wait queue for) a crucial mutex (e.g., for I/O or memory allocation), 
>        * which could cause the whole system to block. So use at your own peril. 
>        */ 
> -     bool unsafe_stop(); 
> +    bool unsafe_stop(); 
> +    void setup_large_syscall_stack(); 
> +    void free_tiny_syscall_stack(); 
>  private: 
>      static void wake_impl(detached_state* st, 
>              unsigned allowed_initial_states_mask = 1 << unsigned(status::waiting)); 
> -- 
> 2.7.4 
>
>
