I am expecting this patch will need some tweaking but I think it 
fundamentally implements SYSCALL call stack switch. I have tested in using 
my golang working branch.

First of all I am assembly novice so I am not sure if I have not messed 
some stuff in entry.S. For example I am not sure I have not messed the CFI 
- on other hand given by previous GDB research it might be mute point given 
gdb cannot handle backtracing non-contiguous stacks.

Secondly I am not sure if tiny stack is enough (1024). I am afraid there 
maybe some scenarios where thread which calls setup_large_syscall_stack() 
may be interrupted (or something else) and bigger stack is needed.

For example this is a crash I got only once when the tiny stack was 512 
bytes deep (not sure if increasing to 1024 has fully addressed it as I have 
not seen it anymore):
[backtrace]
0x000000000044d85c <osv::generate_signal(siginfo&, exception_frame*)+140>
0x000000000044d88a <osv::handle_mmap_fault(unsigned long, int, 
exception_frame*)+26>
0x000000000032b3eb <mmu::vm_fault(unsigned long, exception_frame*)+171>
0x000000000038b4e8 <page_fault+136>
0x000000000038a356 <???+3711830>
0x00000000003e5381 <sched::cpu::reschedule_from_interrupt(bool, std::chrono
::duration<long, std::ratio<1l, 1000000000l> 
>)+561>
0x00000000003e583b <sched::cpu::schedule()+27>
0x00000000003e5f01 <sched::thread::wait()+17>
0x00000000003c5957 <lockfree::mutex::lock()+407>
0x00000000003d4885 <???+4016261>
0x00000000003d4a5a <???+4016730>
0xffff8000015d203f <???+22880319>
0x00000000003e5381 <sched::cpu::reschedule_from_interrupt(bool, std::chrono
::duration<long, std::ratio<1l, 1000000000l> 
>)+561>
0x00000000003e583b <sched::cpu::schedule()+27>
0x00000000003e5f01 <sched::thread::wait()+17>
0x00000000003c5957 <lockfree::mutex::lock()+407>
0x00000000003d4885 <???+4016261>
0x00000000003d4a5a <???+4016730>
0x00000000003d4d46 <malloc+70>
0x00000000003e2780 <sched::thread::setup_large_syscall_stack()+48>
0x000000000038b324 <syscall_entry+36>
0x000010000113cd91 <???+18075025>
0x0000100001125e23 <???+17980963>
0x000010000112c35f <???+18006879>
0x000010000112c2e5 <???+18006757>
0x0000100001306bf2 <???+19950578>
0x0000200000ddff3f <???+14548799>
qemu failed.

I also tried to use mmap() and mprotect() (instead of malloc) to allocate 
memory for large stack and it did not quite work (please see commented out 
code in the patch). Everytime I ran it would crash like this (not event 
same thread when mprotect() was called):

0  0x0000000000225930 in abort (fmt=fmt@entry=0x71d020 "exception nested 
too deeply") at runtime.cc:130
#1  0x0000000000388804 in sched::arch_cpu::enter_exception (this=<optimized 
out>) at arch/x64/arch-cpu.cc:19
#2  sched::exception_guard::exception_guard (this=<optimized out>) at 
arch/x64/arch-cpu.cc:37
#3  0x000000000038c6da in general_protection (ef=0xffff800000f2f048) at 
arch/x64/exceptions.cc:313
#4  <signal handler called>
#5  0x000000000038890e in safe_load<frame*> (data=<synthetic pointer>, 
potentially_bad_pointer=0x56415741e5894855) at ar
ch/x64/safe-ptr.hh:33
#6  backtrace_safe (pc=pc@entry=0xffff800000f2d330, nr=nr@entry=128) at 
arch/x64/backtrace.cc:25
#7  0x000000000020203c in print_backtrace () at runtime.cc:79
#8  0x000000000022591c in abort (fmt=fmt@entry=0x71d3a0 "general protection 
fault\n") at runtime.cc:121
#9  0x000000000038c73a in general_protection (ef=0xffff800000f2e048) at 
arch/x64/exceptions.cc:322
#10 <signal handler called>
#11 std::unique_ptr<sched::thread::detached_state, 
std::default_delete<sched::thread::detached_state> >::get (this=0x5bd
8894808c483d0)
    at /usr/include/c++/5/bits/unique_ptr.h:305
#12 
sched::thread::do_wake_with<waiter::wake()::{lambda()#1}>(waiter::wake()::{lambda()#1},
 
unsigned int) (allowed_initi
al_states_mask=24, 
    action=..., this=0x5bd8894808c48348) at include/osv/sched.hh:1261
#13 
sched::thread::wake_with_from_mutex<waiter::wake()::{lambda()#1}>(waiter::wake()::{lambda()#1})
 
(action=..., this=0x
5bd8894808c48348)
    at include/osv/sched.hh:1287
#14 waiter::wake (this=0x33185e 
<mmu::initialized_anonymous_page_provider::fill(void*, unsigned long, 
unsigned long)+30>
)
    at include/osv/wait_record.hh:38
#15 lockfree::mutex::lock (this=0xffffa00002166648, 
this@entry=0xffff800002953040) at core/lfmutex.cc:74
#16 lockfree_mutex_lock (m=m@entry=0xffffa00002166648) at 
core/lfmutex.cc:282
#17 0x00000000002e213b in mutex_lock (m=0xffffa00002166648) at 
include/osv/mutex.h:59
#18 sa_bulk_lookup (hdl=0xffffa00002166648, 
attrs=attrs@entry=0xffff8000029579c0, count=3)
    at bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c:1851
#19 0x0000000000316cd4 in zfs_getattr (vp=0xffffa0000221e780, 
vap=0xffff800002957aa0)
    at bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c:2055
#20 0x000000000042cf2c in vn_stat (vp=vp@entry=0xffffa0000221e780, 
st=st@entry=0xffff800002957b60) at fs/vfs/vfs_vnode.c
c:325
#21 0x000000000043029a in vfs_file::stat (this=<optimized out>, 
st=0xffff800002957b60) at fs/vfs/vfs_fops.cc:118
#22 0x000000000041d48c in size (f=...) at fs/fs.cc:16
#23 0x00000000003300ed in mmu::file_vma::fault (this=0xffffa0000221ed00, 
addr=17592205996032, ef=0xffff800002958098) at 
core/mmu.cc:1689
#24 0x000000000032b484 in mmu::vm_fault (addr=<optimized out>, 
addr@entry=17592205997277, ef=ef@entry=0xffff800002958098
) at core/mmu.cc:1342
#25 0x000000000038b529 in page_fault (ef=0xffff800002958098) at 
arch/x64/mmu.cc:38
#26 <signal handler called>
#27 runtime.memmove () at 
/home/wkozaczuk/tools/go/src/runtime/memmove_amd64.s:161
#28 0x000010000112864f in runtime.recordForPanic (b=...) at 
/home/wkozaczuk/tools/go/src/runtime/print.go:45

When I commented mprotect() (still mmap()) the test app would work but 
occasionally crash (I do not have an example). SO I wonder if there is some 
fundamental constraint which is violated when calling mmap()/mprotect() 
when setting up large stack (called from assembly in tiny stack) comparing 
to regular malloc(). Is it because mprotect() requires updating memory page 
tables and it triggers some logic that messes things up? If I cannot use 
mmap() how do I implement canary type of logic with malloc()?
  
Please find below some of my comments/questions.

Waldek

On Monday, February 5, 2018 at 12:28:27 AM UTC-5, Waldek Kozaczuk wrote:
>
> This patch implements separate syscall call stack needed 
> when runtimes like Golang use SYSCALL instruction to make 
> system call. 
>
> More specifically each application thread pre-alllocates 
> "tiny" (1024 bytes deep) syscall call stack. When SYSCALL 
> instruction is called the call stack is switched to the 
> tiny stack and the "large" stack is allocated if has not 
> been alllocated yet (first call on given thread). In the end 
> SYSCALL ends up being exexuted on the "large" stack. 
>
> This patch is based on the original patch provided by @Hawxchen. 
>
> Fixes #808 
>
> Signed-off-by: Waldemar Kozaczuk <jwkozac...@gmail.com> 
> --- 
>  arch/x64/arch-switch.hh | 61 +++++++++++++++++++++++++++++++++++++ 
>  arch/x64/arch-tls.hh    |  9 ++++++ 
>  arch/x64/entry.S        | 81 
> +++++++++++++++++++++++++++++++++---------------- 
>  include/osv/sched.hh    |  7 +++++ 
>  linux.cc                |  5 +++ 
>  5 files changed, 137 insertions(+), 26 deletions(-) 
>
> diff --git a/arch/x64/arch-switch.hh b/arch/x64/arch-switch.hh 
> index d1a039a..f291a26 100644 
> --- a/arch/x64/arch-switch.hh 
> +++ b/arch/x64/arch-switch.hh 
> @@ -11,6 +11,7 @@ 
>  #include "msr.hh" 
>  #include <osv/barrier.hh> 
>  #include <string.h> 
> +#include <osv/mmu.hh> 
>   
>  extern "C" { 
>  void thread_main(void); 
> @@ -114,6 +115,7 @@ void thread::init_stack() 
>      _state.rip = reinterpret_cast<void*>(thread_main); 
>      _state.rsp = stacktop; 
>      _state.exception_stack = _arch.exception_stack + 
> sizeof(_arch.exception_stack); 
> +    debug("***-> %d, Stack top: %#x\n", this->_id, stacktop); 
>  } 
>   
>  void thread::setup_tcb() 
> @@ -146,6 +148,50 @@ void thread::setup_tcb() 
>      _tcb = static_cast<thread_control_block*>(p + tls.size + 
> user_tls_size); 
>      _tcb->self = _tcb; 
>      _tcb->tls_base = p + user_tls_size; 
> + 
> +    if(is_app()) { 
> +        // 
> +        // Allocate tiny syscall call stack 
> +        auto& tiny_stack = _attr._tiny_syscall_stack; 
> + 
> +        if(!tiny_stack.begin) { 
> +            tiny_stack.size = 1024; //512 seems to be too small sometimes 
> +            tiny_stack.begin = malloc(tiny_stack.size); 
> +            tiny_stack.deleter = tiny_stack.default_deleter; 
> +        } 
> + 
> +        _tcb->tiny_syscall_stack_addr = tiny_stack.begin + 
> tiny_stack.size; 
> +        _tcb->large_syscall_stack_addr = nullptr; 
> +        debug("***-> %d, Tiny syscall stack top: %#x with size: %d 
> bytes\n", this->_id, 
> +              _tcb->tiny_syscall_stack_addr, tiny_stack.size); 
> +    } 
> +} 
> + 
> +void thread::setup_large_syscall_stack() 
> +{ 
> +    auto& large_stack = _attr._large_syscall_stack; 
> +    if(large_stack.begin) { 
> +        return; 
> +    } 
> + 
> +    large_stack.size = 16 * PAGE_SIZE; 
> + 
> +    /* 
> +     * Mmapping (without mprotect()) seems to work but occasionally leads 
> to crashes ( 
> +    large_stack.begin = mmu::map_anon(nullptr, large_stack.size, 
> mmu::mmap_populate, mmu::perm_rw); 
> +    //mmu::mprotect(large_stack.begin, PAGE_SIZE, 0); //THIS DOES NOT 
> WORK at all -> results in crash 
> +    large_stack.deleter = free_large_syscall_stack; 
> +    */ 
> + 
> +    large_stack.begin = malloc(large_stack.size); 
> +    large_stack.deleter = large_stack.default_deleter; 
> + 
> +    _tcb->large_syscall_stack_addr = large_stack.begin + 
> large_stack.size; 
> +} 
> + 
> +void thread::free_large_syscall_stack(sched::thread::stack_info si) 
> +{ 
> +    mmu::munmap(si.begin, si.size); 
>  } 
>   
>  void thread::free_tcb() 
> @@ -156,6 +202,21 @@ void thread::free_tcb() 
>      } else { 
>          free(_tcb->tls_base); 
>      } 
> + 
> +    if(is_app()) { 
> +        auto& tiny_stack = _attr._tiny_syscall_stack; 
> + 
> +        assert(tiny_stack.begin); 
> + 
> +        if(tiny_stack.deleter) { 
> +            tiny_stack.deleter(tiny_stack); 
> +        } 
> + 
> +        auto& large_stack = _attr._large_syscall_stack; 
> +        if(large_stack.begin && large_stack.deleter) { 
> +            large_stack.deleter(large_stack); 
> +        } 
> +    } 
>  } 
>   
>  void thread_main_c(thread* t) 
> diff --git a/arch/x64/arch-tls.hh b/arch/x64/arch-tls.hh 
> index 1bf86fd..64a1788 100644 
> --- a/arch/x64/arch-tls.hh 
> +++ b/arch/x64/arch-tls.hh 
> @@ -8,9 +8,18 @@ 
>  #ifndef ARCH_TLS_HH 
>  #define ARCH_TLS_HH 
>   
> +// Don't change the declaration sequence of all existing members'. 
> +// Please add new members from the last. 
>  struct thread_control_block { 
>      thread_control_block* self; 
>      void* tls_base; 
> +    // These fields, a per-thread stack for SYSCALL instruction, 
> +    // are used in arch/x64/entry.S for %fs's offset. 
> +    // We currently keep these fields in the TCB to make it easier to 
> access in assembly code 
> +    // through a known offset into %fs. But with more effort, we could 
> have used an 
> +    // ordinary thread-local variable instead and avoided extending the 
> TCB here. 
> +    void* tiny_syscall_stack_addr; 
> +    void* large_syscall_stack_addr; 
>
 
 Should we instead store single pointer to a struct that has pointers to 
both tiny and large stack?

>  }; 
>   
>  #endif /* ARCH_TLS_HH */ 
> diff --git a/arch/x64/entry.S b/arch/x64/entry.S 
> index 04d809d..87f37b8 100644 
> --- a/arch/x64/entry.S 
> +++ b/arch/x64/entry.S 
> @@ -170,25 +170,57 @@ syscall_entry: 
>      .cfi_register rflags, r11 # r11 took previous rflags value 
>      # There is no ring transition and rflags are left unchanged. 
>   
> -    # Skip the "red zone" allowed by the AMD64 ABI (the caller used a 
> -    # SYSCALL instruction and doesn't know he called a function): 
> -    subq $128, %rsp 
> - 
> -    # Align the stack to 16 bytes. We align it now because of limitations 
> of 
> -    # the CFI language, but need to ensure it is still aligned before we 
> call 
> -    # syscall_wrapper(), so must ensure that the number of pushes below 
> are 
> -    # even. 
> -    # An additional complication is that we need to restore %rsp later 
> without 
> -    # knowing how it was previously aligned. In the following trick, 
> without 
> -    # using an additional register, the two pushes leave the stack with 
> the 
> -    # same alignment it had originally, and a copy of the original %rsp 
> at 
> -    # (%rsp) and 8(%rsp). The andq then aligns the stack - if it was 
> already 
> -    # 16 byte aligned nothing changes, if it was 8 byte aligned then it 
> -    # subtracts 8 from %rsp, meaning that the original %rsp is now at 
> 8(%rsp) 
> -    # and 16(%rsp). In both cases we can restore it below from 8(%rsp). 
> -    pushq %rsp 
> -    pushq (%rsp) 
> -    andq $-16, %rsp 
> +    # Switch stack to "tiny" syscall stack that should be large 
> +    # enough to setup "large" syscall stack (only when first SYSCALL on 
> this thread) 
> +    xchgq %rsp, %fs:16 
> + 
> +    # Skip setup large stack if has already happenned 
> +    pushq %rcx 
> +    movq %fs:24, %rcx 
> +    cmpq $0, %rcx 
> +    jne large_stack_has_been_setup 
> + 
>
 
Am I saving too many registers? 

+    # Save all registers (maybe it saves to many registers) 
> +    pushq %rbp 
> +    pushq %rbx 
> +    pushq %rax 
> +    pushq %rdx 
> +    pushq %rsi 
> +    pushq %rdi 
> +    pushq %r8 
> +    pushq %r9 
> +    pushq %r10 
> +    pushq %r11 # contains rflags before syscall instruction 
> +    pushq %r12 
> +    pushq %r13 
> +    pushq %r14 
> +    pushq %r15 
> + 
> +    # Call setup_large_syscall_stack to prepare large call stack 
> +    # This function does not take any arguments nor returns 
> +    # It it ends up allocating large stack it will store its address in 
> tcb 
> +    callq setup_large_syscall_stack 
> + 
> +    # Restore all registers 
> +    popq %r15 
> +    popq %r14 
> +    popq %r13 
> +    popq %r12 
> +    popq %r11 
> +    popq %r10 
> +    popq %r9 
> +    popq %r8 
> +    popq %rdi 
> +    popq %rsi 
> +    popq %rdx 
> +    popq %rax 
> +    popq %rbx 
> +    popq %rbp 
> + 
> +large_stack_has_been_setup: 
> +    popq %rcx 
> +    # Switch stack to "large" syscall stack 
> +    movq %fs:24, %rsp 
>   
>      .cfi_def_cfa %rsp, 0 
>   
> @@ -204,9 +236,8 @@ syscall_entry: 
>      # We do this just so we can refer to it with CFI and help gdb's DWARF 
>      # stack unwinding. This saving not otherwise needed for correct 
> operation 
>      # (we anyway restore it below by undoing all our modifications). 
> -    movq 24(%rsp), %rbp 
> -    addq $128, %rbp 
> -    pushq %rbp 
> +    pushq %fs:24 
> + 
>      .cfi_adjust_cfa_offset 8 
>      .cfi_rel_offset %rsp, 0 
>   
> @@ -275,16 +306,14 @@ syscall_entry: 
>      popq_cfi %rbp 
>      popq_cfi %rcx 
>   
> -    movq 8(%rsp), %rsp # undo alignment (as explained above) 
> - 
>      # restore rflags 
>      # push the rflag state syscall saved in r11 to the stack 
>      pushq %r11 
>      # pop the stack value in flag register 
>      popfq 
>   
> -    #undo red-zone skip without altering restored flags 
> -    lea 128(%rsp), %rsp 
> +    # Restore original stack pointer 
> +    xchgq %rsp, %fs:16 
>   
>      # jump to rcx where the syscall instruction put rip 
>      # (sysret would leave rxc cloberred so we have nothing to do to 
> restore it) 
> diff --git a/include/osv/sched.hh b/include/osv/sched.hh 
> index dada8f5..dfbd3bb 100644 
> --- a/include/osv/sched.hh 
> +++ b/include/osv/sched.hh 
> @@ -338,6 +338,10 @@ public: 
>      }; 
>      struct attr { 
>          stack_info _stack; 
> +        // These stacks are used only for application threads during 
> SYSCALL instruction. 
> +        // See issue #808 for why it's needed. 
> +        stack_info _tiny_syscall_stack{}; // Initialized with zero since 
> C++11. 
> +        stack_info _large_syscall_stack{}; // Initialized with zero since 
> C++11. 
>          cpu *_pinned_cpu; 
>          bool _detached; 
>          std::array<char, 16> _name = {}; 
> @@ -592,6 +596,7 @@ public: 
>        * which could cause the whole system to block. So use at your own 
> peril. 
>        */ 
>       bool unsafe_stop(); 
> +     void setup_large_syscall_stack(); 
>  private: 
>      static void wake_impl(detached_state* st, 
>              unsigned allowed_initial_states_mask = 1 << 
> unsigned(status::waiting)); 
> @@ -618,6 +623,8 @@ private: 
>      friend void start_early_threads(); 
>      void* do_remote_thread_local_var(void* var); 
>      thread_handle handle(); 
> +    static void free_large_syscall_stack(sched::thread::stack_info si); 
> + 
>  public: 
>      template <typename T> 
>      T& remote_thread_local_var(T& var) 
> diff --git a/linux.cc b/linux.cc 
> index 25d28ee..3edd598 100644 
> --- a/linux.cc 
> +++ b/linux.cc 
> @@ -430,3 +430,8 @@ extern "C" long syscall_wrapper(long number, long p1, 
> long p2, long p3, long p4, 
>      } 
>      return ret; 
>  } 
> + 
> +extern "C" void setup_large_syscall_stack() 
> +{ 
> +    sched::thread::current()->setup_large_syscall_stack(); 
> +} 
> -- 
> 2.7.4 
>
>

-- 
You received this message because you are subscribed to the Google Groups "OSv 
Development" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to osv-dev+unsubscr...@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.

Reply via email to