This patch implements the separate syscall call stack needed
when runtimes like Golang use the SYSCALL instruction to execute
system calls. More specifically, in the case of Golang, the tiny stacks
used by coroutines are not deep enough to execute system
calls on OSv, which handles them as regular function calls.
On an OS like Linux, tiny Golang stacks are not a problem, as
the system calls are executed on a separate kernel stack. For example,
the golang-httpserver app should now function well under load (at least 100
concurrent requests per second using ab).

More specifically, each application thread pre-allocates a
"tiny" (1024-byte deep) syscall call stack. When the SYSCALL
instruction is invoked for the first time on a given thread, the call stack
is switched to the tiny syscall stack in order to set up the "large"
stack and switch to it. Eventually, execution of the syscall continues
on the large syscall stack. From this point on, the second and subsequent
executions of the SYSCALL instruction are handled on the large syscall stack.

Please note that because the stack is switched from the original
caller stack to the lazily-allocated syscall stack, the stack
pointer "jumps" to higher addresses. From gdb's
perspective this results in the "Backtrace stopped: previous
frame inner to this frame (corrupt stack?)" message.
Gdb assumes that the stack pointer should decrease in value as the stack
"grows", which is not the case when the stack is switched to the syscall stack.

This patch is based on the original patch provided by @Hawxchen.

Fixes #808

Signed-off-by: Waldemar Kozaczuk <jwkozac...@gmail.com>
---
 arch/x64/arch-switch.hh | 108 ++++++++++++++++++++++++++++++++++++++++++++++++
 arch/x64/arch-tls.hh    |  19 +++++++++
 arch/x64/entry.S        |  95 +++++++++++++++++++++++++++++-------------
 include/osv/sched.hh    |   4 +-
 4 files changed, 197 insertions(+), 29 deletions(-)

diff --git a/arch/x64/arch-switch.hh b/arch/x64/arch-switch.hh
index d1a039a..e376954 100644
--- a/arch/x64/arch-switch.hh
+++ b/arch/x64/arch-switch.hh
@@ -12,6 +12,37 @@
 #include <osv/barrier.hh>
 #include <string.h>
 
+//
+// The last 16 bytes of the syscall stack are reserved for -
+// tiny/large indicator and extra 8 bytes to make it 16 bytes aligned
+// as Linux x64 ABI mandates.
+#define SYSCALL_STACK_RESERVED_SPACE_SIZE (2 * 8)
+//
+// The tiny stack has to be large enough to allow for execution of
+// thread::setup_large_syscall_stack() that allocates and sets up
+// large syscall stack. It was measured that as of this writing
+// setup_large_syscall_stack() needs a little over 600 bytes of stack
+// to properly operate. This makes 1024 bytes to be an adequate size
+// of tiny stack.
+// All application threads pre-allocate tiny syscall stack so there
+// is a tiny penalty with this solution.
+#define TINY_SYSCALL_STACK_SIZE 1024
+#define TINY_SYSCALL_STACK_DEPTH (TINY_SYSCALL_STACK_SIZE - 
SYSCALL_STACK_RESERVED_SPACE_SIZE)
+//
+// The large syscall stack is setup and switched to on first
+// execution of SYSCALL instruction for given application thread.
+#define LARGE_SYSCALL_STACK_SIZE (16 * PAGE_SIZE)
+#define LARGE_SYSCALL_STACK_DEPTH (LARGE_SYSCALL_STACK_SIZE - 
SYSCALL_STACK_RESERVED_SPACE_SIZE)
+
+#define SET_SYSCALL_STACK_TYPE_INDICATOR(value) \
+*((long*)(_tcb->syscall_stack_top)) = value;
+
+#define GET_SYSCALL_STACK_TYPE_INDICATOR() \
+*((long*)(_tcb->syscall_stack_top))
+
+#define TINY_SYSCALL_STACK_INDICATOR 0l
+#define LARGE_SYSCALL_STACK_INDICATOR 1l
+
 extern "C" {
 void thread_main(void);
 void thread_main_c(sched::thread* t);
@@ -146,6 +177,66 @@ void thread::setup_tcb()
     _tcb = static_cast<thread_control_block*>(p + tls.size + user_tls_size);
     _tcb->self = _tcb;
     _tcb->tls_base = p + user_tls_size;
+
+    if (is_app()) {
+        //
+        // Allocate TINY syscall call stack
+        void* tiny_syscall_stack_begin = malloc(TINY_SYSCALL_STACK_SIZE);
+        assert(tiny_syscall_stack_begin);
+        //
+        // The top of the stack needs to be 16 bytes lower to make space for
+        // OSv syscall stack type indicator and extra 8 bytes to make it 
16-bytes aligned
+        _tcb->syscall_stack_top = tiny_syscall_stack_begin + 
TINY_SYSCALL_STACK_DEPTH;
+        SET_SYSCALL_STACK_TYPE_INDICATOR(TINY_SYSCALL_STACK_INDICATOR);
+    }
+    else {
+        _tcb->syscall_stack_top = 0;
+    }
+}
+
+void thread::setup_large_syscall_stack()
+{
+    assert(is_app());
+    assert(GET_SYSCALL_STACK_TYPE_INDICATOR() == TINY_SYSCALL_STACK_INDICATOR);
+    //
+    // Allocate LARGE syscall stack
+    void* large_syscall_stack_begin = malloc(LARGE_SYSCALL_STACK_SIZE);
+    void* large_syscall_stack_top = large_syscall_stack_begin + 
LARGE_SYSCALL_STACK_DEPTH;
+    //
+    // Copy all of the tiny stack to the are of last 1024 bytes of large stack.
+    // This way we do not have to pop and push the same registers to be saved 
again.
+    // Also the caller stack pointer is also copied.
+    // We could have copied only last 128 (registers) + 16 bytes (2 fields) 
instead
+    // of all of the stack but copying 1024 is simpler and happens
+    // only once per thread.
+    void* tiny_syscall_stack_top = _tcb->syscall_stack_top;
+    memcpy(large_syscall_stack_top - TINY_SYSCALL_STACK_DEPTH,
+           tiny_syscall_stack_top - TINY_SYSCALL_STACK_DEPTH, 
TINY_SYSCALL_STACK_SIZE);
+    //
+    // Save beginning of tiny stack at the bottom of LARGE stack so
+    // that we can deallocate it in free_tiny_syscall_stack
+    *((void**)large_syscall_stack_begin) = tiny_syscall_stack_top - 
TINY_SYSCALL_STACK_DEPTH;
+    //
+    // Set canary value (0xDEADBEAFDEADBEAF) under bottom + 8 of LARGE stack
+    *((long*)(large_syscall_stack_begin + 8)) = 0xdeadbeafdeadbeaf;
+    //
+    // Switch syscall stack address value in TCB to the top of the LARGE one
+    _tcb->syscall_stack_top = large_syscall_stack_top;
+    SET_SYSCALL_STACK_TYPE_INDICATOR(LARGE_SYSCALL_STACK_INDICATOR);
+}
+
+void thread::free_tiny_syscall_stack()
+{
+    assert(is_app());
+    assert(GET_SYSCALL_STACK_TYPE_INDICATOR() == 
LARGE_SYSCALL_STACK_INDICATOR);
+
+    void* large_syscall_stack_top = _tcb->syscall_stack_top;
+    void* large_syscall_stack_begin = large_syscall_stack_top - 
LARGE_SYSCALL_STACK_DEPTH;
+    //
+    // Lookup address of tiny stack saved by setup_large_syscall_stack()
+    // at the bottom of LARGE stack (current syscall stack)
+    void* tiny_syscall_stack_begin = *((void**)large_syscall_stack_begin);
+    free(tiny_syscall_stack_begin);
 }
 
 void thread::free_tcb()
@@ -156,6 +247,13 @@ void thread::free_tcb()
     } else {
         free(_tcb->tls_base);
     }
+
+    if (_tcb->syscall_stack_top) {
+        void* syscall_stack_begin = GET_SYSCALL_STACK_TYPE_INDICATOR() == 
TINY_SYSCALL_STACK_INDICATOR ?
+            _tcb->syscall_stack_top - TINY_SYSCALL_STACK_DEPTH :
+            _tcb->syscall_stack_top - LARGE_SYSCALL_STACK_DEPTH;
+        free(syscall_stack_begin);
+    }
 }
 
 void thread_main_c(thread* t)
@@ -171,6 +269,16 @@ void thread_main_c(thread* t)
     t->complete();
 }
 
+extern "C" void setup_large_syscall_stack()
+{
+    sched::thread::current()->setup_large_syscall_stack();
+}
+
+extern "C" void free_tiny_syscall_stack()
+{
+    sched::thread::current()->free_tiny_syscall_stack();
+}
+
 }
 
 #endif /* ARCH_SWITCH_HH_ */
diff --git a/arch/x64/arch-tls.hh b/arch/x64/arch-tls.hh
index 1bf86fd..82871d6 100644
--- a/arch/x64/arch-tls.hh
+++ b/arch/x64/arch-tls.hh
@@ -8,9 +8,28 @@
 #ifndef ARCH_TLS_HH
 #define ARCH_TLS_HH
 
+// Don't change the declaration sequence of all existing members'.
+// Please add new members from the last.
 struct thread_control_block {
     thread_control_block* self;
     void* tls_base;
+    //
+    // This field, a per-thread stack for SYSCALL instruction, is  used in
+    // arch/x64/entry.S for %fs's offset.  We currently keep this field in the 
TCB
+    // to make it easier to access in assembly code through a known offset at 
%fs:16.
+    // But with more effort, we could have used an ordinary thread-local 
variable
+    // instead and avoided extending the TCB here.
+    //
+    // The 8 bytes at the top of the syscall stack are used to identify if
+    // the stack is tiny (0) or large (1). So the size of the syscall stack is 
in
+    // reality smaller by 16 bytes from what was originally allocated because 
we need
+    // to make it 16-bytes aligned.
+    void* syscall_stack_top;
+    //
+    // This field is used to store the syscall caller stack pointer (value of 
RSP when
+    // SYSCALL was called) so that it can be restored when syscall completed.
+    // Same as above this field could be an ordinary thread-local variable.
+    void* syscall_caller_stack_pointer;
 };
 
 #endif /* ARCH_TLS_HH */
diff --git a/arch/x64/entry.S b/arch/x64/entry.S
index 04d809d..0c7164e 100644
--- a/arch/x64/entry.S
+++ b/arch/x64/entry.S
@@ -169,27 +169,69 @@ syscall_entry:
     .cfi_register rip, rcx # rcx took previous rip value
     .cfi_register rflags, r11 # r11 took previous rflags value
     # There is no ring transition and rflags are left unchanged.
-
-    # Skip the "red zone" allowed by the AMD64 ABI (the caller used a
-    # SYSCALL instruction and doesn't know he called a function):
-    subq $128, %rsp
-
-    # Align the stack to 16 bytes. We align it now because of limitations of
-    # the CFI language, but need to ensure it is still aligned before we call
-    # syscall_wrapper(), so must ensure that the number of pushes below are
-    # even.
-    # An additional complication is that we need to restore %rsp later without
-    # knowing how it was previously aligned. In the following trick, without
-    # using an additional register, the two pushes leave the stack with the
-    # same alignment it had originally, and a copy of the original %rsp at
-    # (%rsp) and 8(%rsp). The andq then aligns the stack - if it was already
-    # 16 byte aligned nothing changes, if it was 8 byte aligned then it
-    # subtracts 8 from %rsp, meaning that the original %rsp is now at 8(%rsp)
-    # and 16(%rsp). In both cases we can restore it below from 8(%rsp).
-    pushq %rsp
-    pushq (%rsp)
-    andq $-16, %rsp
-
+    #
+    # Unfortunately the mov instruction cannot be used to dereference an 
address
+    # on syscall stack pointed by address in TCB (%fs:16) - double memory 
dereference.
+    # Therefore we are forced to save caller stack address in a field in TCB.
+    movq %rsp, %fs:24 # syscall_caller_stack_pointer
+    #
+    # Switch stack to "tiny" syscall stack that should be large
+    # enough to setup "large" syscall stack (only when first SYSCALL on this 
thread)
+    movq %fs:16, %rsp
+
+    # Skip large syscall stack setup if it has been already setup
+    cmpq $0, (%rsp)  // Check if we are on tiny or large stack
+    jne large_stack_has_been_setup
+
+    # We are on small stack
+    # Save all registers
+    pushq %rcx
+    pushq %rcx
+    pushq %rbp
+    pushq %rbx
+    pushq %rax
+    pushq %rdx
+    pushq %rsi
+    pushq %rdi
+    pushq %r8
+    pushq %r9
+    pushq %r10
+    pushq %r11 # contains rflags before syscall instruction
+    pushq %r12
+    pushq %r13
+    pushq %r14
+    pushq %r15
+    # Please note we pushed rcx twice to make stack 16-bytes aligned
+
+    # Call setup_large_syscall_stack to setup large call stack
+    # This function does not take any arguments nor returns anything.
+    # It ends up allocating large stack and storing its address in tcb
+    callq setup_large_syscall_stack
+    movq %fs:16, %rsp  // Switch stack to large stack
+    subq $128, %rsp    // Skip 128 bytes of large stack so that we can restore 
all registers saved above (16 pushes).
+                       // Please note that these 128 bytes have been copied by 
setup_large_syscall_stack function
+                       // so that we do not have to pop and then push same 
registers again.
+    callq free_tiny_syscall_stack
+
+    # Restore all registers
+    popq %r15
+    popq %r14
+    popq %r13
+    popq %r12
+    popq %r11
+    popq %r10
+    popq %r9
+    popq %r8
+    popq %rdi
+    popq %rsi
+    popq %rdx
+    popq %rax
+    popq %rbx
+    popq %rbp
+    popq %rcx
+    popq %rcx
+
+large_stack_has_been_setup:
     .cfi_def_cfa %rsp, 0
 
     # We need to save and restore the caller's %rbp anyway, so let's also
@@ -204,9 +246,8 @@ syscall_entry:
     # We do this just so we can refer to it with CFI and help gdb's DWARF
     # stack unwinding. This saving not otherwise needed for correct operation
     # (we anyway restore it below by undoing all our modifications).
-    movq 24(%rsp), %rbp
-    addq $128, %rbp
-    pushq %rbp
+    pushq %fs:24
+
     .cfi_adjust_cfa_offset 8
     .cfi_rel_offset %rsp, 0
 
@@ -275,16 +316,14 @@ syscall_entry:
     popq_cfi %rbp
     popq_cfi %rcx
 
-    movq 8(%rsp), %rsp # undo alignment (as explained above)
-
     # restore rflags
     # push the rflag state syscall saved in r11 to the stack
     pushq %r11
     # pop the stack value in flag register
     popfq
 
-    #undo red-zone skip without altering restored flags
-    lea 128(%rsp), %rsp
+    # Restore caller stack pointer
+    movq %fs:24, %rsp
 
     # jump to rcx where the syscall instruction put rip
     # (sysret would leave rxc cloberred so we have nothing to do to restore it)
diff --git a/include/osv/sched.hh b/include/osv/sched.hh
index dada8f5..033dfae 100644
--- a/include/osv/sched.hh
+++ b/include/osv/sched.hh
@@ -591,7 +591,9 @@ public:
       * wait queue for) a crucial mutex (e.g., for I/O or memory allocation),
       * which could cause the whole system to block. So use at your own peril.
       */
-     bool unsafe_stop();
+    bool unsafe_stop();
+    void setup_large_syscall_stack();
+    void free_tiny_syscall_stack();
 private:
     static void wake_impl(detached_state* st,
             unsigned allowed_initial_states_mask = 1 << 
unsigned(status::waiting));
-- 
2.7.4

-- 
You received this message because you are subscribed to the Google Groups "OSv 
Development" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to osv-dev+unsubscr...@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.

Reply via email to