This patch implements a separate syscall call stack, needed
when runtimes like Golang use the SYSCALL instruction to make
system calls.

More specifically, each application thread pre-allocates a 
"tiny" (1024 bytes deep) syscall call stack. When the SYSCALL 
instruction is invoked, the call stack is switched to the 
tiny stack, and the "large" stack is allocated if it has not 
been allocated yet (first call on the given thread). In the end, 
the SYSCALL ends up being executed on the "large" stack.

This patch is based on the original patch provided by @Hawxchen. 

Fixes #808

Signed-off-by: Waldemar Kozaczuk <jwkozac...@gmail.com>
---
 arch/x64/arch-switch.hh | 61 +++++++++++++++++++++++++++++++++++++
 arch/x64/arch-tls.hh    |  9 ++++++
 arch/x64/entry.S        | 81 +++++++++++++++++++++++++++++++++----------------
 include/osv/sched.hh    |  7 +++++
 linux.cc                |  5 +++
 5 files changed, 137 insertions(+), 26 deletions(-)

diff --git a/arch/x64/arch-switch.hh b/arch/x64/arch-switch.hh
index d1a039a..f291a26 100644
--- a/arch/x64/arch-switch.hh
+++ b/arch/x64/arch-switch.hh
@@ -11,6 +11,7 @@
 #include "msr.hh"
 #include <osv/barrier.hh>
 #include <string.h>
+#include <osv/mmu.hh>
 
 extern "C" {
 void thread_main(void);
@@ -114,6 +115,7 @@ void thread::init_stack()
     _state.rip = reinterpret_cast<void*>(thread_main);
     _state.rsp = stacktop;
     _state.exception_stack = _arch.exception_stack + 
sizeof(_arch.exception_stack);
+    debug("***-> %d, Stack top: %#x\n", this->_id, stacktop);
 }
 
 void thread::setup_tcb()
@@ -146,6 +148,50 @@ void thread::setup_tcb()
     _tcb = static_cast<thread_control_block*>(p + tls.size + user_tls_size);
     _tcb->self = _tcb;
     _tcb->tls_base = p + user_tls_size;
+
+    if(is_app()) {
+        //
+        // Allocate tiny syscall call stack
+        auto& tiny_stack = _attr._tiny_syscall_stack;
+
+        if(!tiny_stack.begin) {
+            tiny_stack.size = 1024; //512 seems to be too small sometimes
+            tiny_stack.begin = malloc(tiny_stack.size);
+            tiny_stack.deleter = tiny_stack.default_deleter;
+        }
+
+        _tcb->tiny_syscall_stack_addr = tiny_stack.begin + tiny_stack.size;
+        _tcb->large_syscall_stack_addr = nullptr;
+        debug("***-> %d, Tiny syscall stack top: %#x with size: %d bytes\n", 
this->_id,
+              _tcb->tiny_syscall_stack_addr, tiny_stack.size);
+    }
+}
+
+void thread::setup_large_syscall_stack()
+{
+    auto& large_stack = _attr._large_syscall_stack;
+    if(large_stack.begin) {
+        return;
+    }
+
+    large_stack.size = 16 * PAGE_SIZE;
+
+    /*
+     * Mmapping (without mprotect()) seems to work but occasionally leads to 
crashes (
+    large_stack.begin = mmu::map_anon(nullptr, large_stack.size, 
mmu::mmap_populate, mmu::perm_rw);
+    //mmu::mprotect(large_stack.begin, PAGE_SIZE, 0); //THIS DOES NOT WORK at 
all -> results in crash
+    large_stack.deleter = free_large_syscall_stack;
+    */
+
+    large_stack.begin = malloc(large_stack.size);
+    large_stack.deleter = large_stack.default_deleter;
+
+    _tcb->large_syscall_stack_addr = large_stack.begin + large_stack.size;
+}
+
+void thread::free_large_syscall_stack(sched::thread::stack_info si)
+{
+    mmu::munmap(si.begin, si.size);
 }
 
 void thread::free_tcb()
@@ -156,6 +202,21 @@ void thread::free_tcb()
     } else {
         free(_tcb->tls_base);
     }
+
+    if(is_app()) {
+        auto& tiny_stack = _attr._tiny_syscall_stack;
+
+        assert(tiny_stack.begin);
+
+        if(tiny_stack.deleter) {
+            tiny_stack.deleter(tiny_stack);
+        }
+
+        auto& large_stack = _attr._large_syscall_stack;
+        if(large_stack.begin && large_stack.deleter) {
+            large_stack.deleter(large_stack);
+        }
+    }
 }
 
 void thread_main_c(thread* t)
diff --git a/arch/x64/arch-tls.hh b/arch/x64/arch-tls.hh
index 1bf86fd..64a1788 100644
--- a/arch/x64/arch-tls.hh
+++ b/arch/x64/arch-tls.hh
@@ -8,9 +8,18 @@
 #ifndef ARCH_TLS_HH
 #define ARCH_TLS_HH
 
+// Don't change the declaration order of the existing members.
+// Please add new members at the end.
 struct thread_control_block {
     thread_control_block* self;
     void* tls_base;
+    // These fields, a per-thread stack for SYSCALL instruction,
+    // are used in arch/x64/entry.S for %fs's offset.
+    // We currently keep these fields in the TCB to make it easier to access 
in assembly code
+    // through a known offset into %fs. But with more effort, we could have 
used an
+    // ordinary thread-local variable instead and avoided extending the TCB 
here.
+    void* tiny_syscall_stack_addr;
+    void* large_syscall_stack_addr;
 };
 
 #endif /* ARCH_TLS_HH */
diff --git a/arch/x64/entry.S b/arch/x64/entry.S
index 04d809d..87f37b8 100644
--- a/arch/x64/entry.S
+++ b/arch/x64/entry.S
@@ -170,25 +170,57 @@ syscall_entry:
     .cfi_register rflags, r11 # r11 took previous rflags value
     # There is no ring transition and rflags are left unchanged.
 
-    # Skip the "red zone" allowed by the AMD64 ABI (the caller used a
-    # SYSCALL instruction and doesn't know he called a function):
-    subq $128, %rsp
-
-    # Align the stack to 16 bytes. We align it now because of limitations of
-    # the CFI language, but need to ensure it is still aligned before we call
-    # syscall_wrapper(), so must ensure that the number of pushes below are
-    # even.
-    # An additional complication is that we need to restore %rsp later without
-    # knowing how it was previously aligned. In the following trick, without
-    # using an additional register, the two pushes leave the stack with the
-    # same alignment it had originally, and a copy of the original %rsp at
-    # (%rsp) and 8(%rsp). The andq then aligns the stack - if it was already
-    # 16 byte aligned nothing changes, if it was 8 byte aligned then it
-    # subtracts 8 from %rsp, meaning that the original %rsp is now at 8(%rsp)
-    # and 16(%rsp). In both cases we can restore it below from 8(%rsp).
-    pushq %rsp
-    pushq (%rsp)
-    andq $-16, %rsp
+    # Switch stack to the "tiny" syscall stack that should be large
+    # enough to set up the "large" syscall stack (only when first SYSCALL on this 
thread)
+    xchgq %rsp, %fs:16
+
+    # Skip setting up the large stack if it has already happened
+    pushq %rcx
+    movq %fs:24, %rcx
+    cmpq $0, %rcx
+    jne large_stack_has_been_setup
+
+    # Save all registers (this may save more registers than strictly necessary)
+    pushq %rbp
+    pushq %rbx
+    pushq %rax
+    pushq %rdx
+    pushq %rsi
+    pushq %rdi
+    pushq %r8
+    pushq %r9
+    pushq %r10
+    pushq %r11 # contains rflags before syscall instruction
+    pushq %r12
+    pushq %r13
+    pushq %r14
+    pushq %r15
+
+    # Call setup_large_syscall_stack to prepare the large call stack
+    # This function takes no arguments and returns nothing
+    # If it ends up allocating the large stack, it stores its address in the TCB
+    callq setup_large_syscall_stack
+
+    # Restore all registers
+    popq %r15
+    popq %r14
+    popq %r13
+    popq %r12
+    popq %r11
+    popq %r10
+    popq %r9
+    popq %r8
+    popq %rdi
+    popq %rsi
+    popq %rdx
+    popq %rax
+    popq %rbx
+    popq %rbp
+
+large_stack_has_been_setup:
+    popq %rcx
+    # Switch stack to "large" syscall stack
+    movq %fs:24, %rsp
 
     .cfi_def_cfa %rsp, 0
 
@@ -204,9 +236,8 @@ syscall_entry:
     # We do this just so we can refer to it with CFI and help gdb's DWARF
     # stack unwinding. This saving not otherwise needed for correct operation
     # (we anyway restore it below by undoing all our modifications).
-    movq 24(%rsp), %rbp
-    addq $128, %rbp
-    pushq %rbp
+    pushq %fs:24
+
     .cfi_adjust_cfa_offset 8
     .cfi_rel_offset %rsp, 0
 
@@ -275,16 +306,14 @@ syscall_entry:
     popq_cfi %rbp
     popq_cfi %rcx
 
-    movq 8(%rsp), %rsp # undo alignment (as explained above)
-
     # restore rflags
     # push the rflag state syscall saved in r11 to the stack
     pushq %r11
     # pop the stack value in flag register
     popfq
 
-    #undo red-zone skip without altering restored flags
-    lea 128(%rsp), %rsp
+    # Restore original stack pointer
+    xchgq %rsp, %fs:16
 
     # jump to rcx where the syscall instruction put rip
     # (sysret would leave rxc cloberred so we have nothing to do to restore it)
diff --git a/include/osv/sched.hh b/include/osv/sched.hh
index dada8f5..dfbd3bb 100644
--- a/include/osv/sched.hh
+++ b/include/osv/sched.hh
@@ -338,6 +338,10 @@ public:
     };
     struct attr {
         stack_info _stack;
+        // These stacks are used only for application threads during SYSCALL 
instruction.
+        // See issue #808 for why it's needed.
+        stack_info _tiny_syscall_stack{}; // Initialized with zero since C++11.
+        stack_info _large_syscall_stack{}; // Initialized with zero since 
C++11.
         cpu *_pinned_cpu;
         bool _detached;
         std::array<char, 16> _name = {};
@@ -592,6 +596,7 @@ public:
       * which could cause the whole system to block. So use at your own peril.
       */
      bool unsafe_stop();
+     void setup_large_syscall_stack();
 private:
     static void wake_impl(detached_state* st,
             unsigned allowed_initial_states_mask = 1 << 
unsigned(status::waiting));
@@ -618,6 +623,8 @@ private:
     friend void start_early_threads();
     void* do_remote_thread_local_var(void* var);
     thread_handle handle();
+    static void free_large_syscall_stack(sched::thread::stack_info si);
+
 public:
     template <typename T>
     T& remote_thread_local_var(T& var)
diff --git a/linux.cc b/linux.cc
index 25d28ee..3edd598 100644
--- a/linux.cc
+++ b/linux.cc
@@ -430,3 +430,8 @@ extern "C" long syscall_wrapper(long number, long p1, long 
p2, long p3, long p4,
     }
     return ret;
 }
+
+extern "C" void setup_large_syscall_stack()
+{
+    sched::thread::current()->setup_large_syscall_stack();
+}
-- 
2.7.4

-- 
You received this message because you are subscribed to the Google Groups "OSv 
Development" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to osv-dev+unsubscr...@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.

Reply via email to