From: Waldemar Kozaczuk <jwkozac...@gmail.com>
Committer: WALDEMAR KOZACZUK <jwkozac...@gmail.com>
Branch: master

implement clone/3, set_robust_list and set_tid_address syscalls

This PR implements clone, clone3, set_robust_list, and set_tid_address
syscalls needed to support running multi-threaded static executables on
OSv.

The bulk of this patch is the implementation of clone and its clone3 variant.
More specifically, sys_clone() implements only a tiny subset of
what the Linux manual describes - the handling of CLONE_THREAD - which
is what glibc uses to implement pthread_create().

In essence, sys_clone() creates a new thread and sets the application TCB
if one is present. When started, the new thread executes code implemented in
assembly that restores most of the registers and jumps to the instruction
that the parent thread calling clone would execute next. So effectively a
thread calling the clone syscall "clones" itself by creating a new child thread
that resumes at the same place in the code, right after the syscall instruction;
the address of that place is held in the RCX register. All the registers to be
restored in the child thread are copied from the frame on the parent thread's
syscall stack.
Detailed comments explaining the implementation of clone() are interleaved
with the code of sys_clone() in clone.cc.

This patch also implements two other related syscalls - set_robust_list and
set_tid_address - which are mostly described in
https://www.kernel.org/doc/Documentation/robust-futexes.txt

With this patch, the following simple example, compiled as a static
executable, runs fine on OSv:

/* Thread entry point: prints a message and finishes with no result. */
void* secondary(void *ignore)
{
    (void)ignore; /* the argument is intentionally unused */
    printf("secondary thread\n");
    return NULL;  /* a pthread start routine must return a value */
}

/* Starts 10 threads running secondary() and waits for all of them. */
int main(void) {
    pthread_t threads[10];
    for (int i = 0; i < 10; i++)
       pthread_create(&threads[i], NULL, secondary, NULL);

    printf("Created 10 threads\n");

    for (int i = 0; i < 10; i++)
       pthread_join(threads[i], NULL);
    printf("Joined 10 threads\n");
    return 0;
}

Fixes #1139

Signed-off-by: Waldemar Kozaczuk <jwkozac...@gmail.com>

Reverse futex/clear_id cleanup

---
diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -1013,6 +1013,7 @@ objects += arch/x64/apic.o
 objects += arch/x64/apic-clock.o
 objects += arch/x64/entry-xen.o
 objects += arch/x64/prctl.o
+objects += arch/x64/clone.o
 objects += arch/x64/vmlinux.o
 objects += arch/x64/vmlinux-boot64.o
 objects += arch/x64/pvh-boot.o
diff --git a/arch/x64/arch-switch.hh b/arch/x64/arch-switch.hh
--- a/arch/x64/arch-switch.hh
+++ b/arch/x64/arch-switch.hh
@@ -360,6 +360,11 @@ void thread::free_syscall_stack()
     }
 }
 
+void* thread::get_syscall_stack_top() //Accessor for the stack_top field of this thread's syscall stack descriptor
+{
+    return _state._syscall_stack_descriptor.stack_top;
+}
+
 void thread_main_c(thread* t)
 {
     arch::irq_enable();
diff --git a/arch/x64/clone.cc b/arch/x64/clone.cc
--- a/arch/x64/clone.cc
+++ b/arch/x64/clone.cc
@@ -0,0 +1,134 @@
+/*
+ * Copyright (C) 2023 Waldemar Kozaczuk
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+#include "arch.hh"
+#include <errno.h>
+#include <osv/sched.hh>
+#include "tls-switch.hh"
+
+#define CLONE_THREAD           0x00010000 //Must be set; sys_clone() rejects requests without it (ENOSYS)
+#define CLONE_SETTLS           0x00080000 //newtls is installed as the child's app TCB
+#define CLONE_CHILD_SETTID     0x01000000 //Child thread id is stored at *ctid
+#define CLONE_PARENT_SETTID    0x00100000 //Child thread id is stored at *ptid
+#define CLONE_CHILD_CLEARTID   0x00200000 //ctid is registered via set_clear_id() on the child
+
+static constexpr size_t CHILD_FRAME_OFFSET = 136;  //Distance from the child's syscall stack top down to the start of its frame (FRAME_SIZE plus the RAX and RSP slots)
+static constexpr size_t PARENT_FRAME_OFFSET = 120; //Distance from the parent's syscall stack top down to the start of the bytes that get copied
+static constexpr size_t FRAME_SIZE = 120;          //Number of bytes copied from the parent's frame into the child's frame
+static constexpr size_t RSP_OFFSET = 8;            //Slot (below the child's syscall stack top) that receives child_stack
+static constexpr size_t RAX_OFFSET = 16;           //Slot (below the child's syscall stack top) that is zeroed for the child's RAX
+
+int sys_clone(unsigned long flags, void *child_stack, int *ptid, int *ctid, 
unsigned long newtls)
+{   //Returns the id of the newly created thread, or -1 with errno set when the request is rejected
+    //We only support "cloning" of threads so fork() would fail but 
pthread_create() should
+    //succeed
+    if (!(flags & CLONE_THREAD)) {
+       errno = ENOSYS;
+       return -1;
+    }
+    //
+    //Validate we have non-empty stack
+    if (!child_stack) {
+       errno = EINVAL;
+       return -1;
+    }
+    //
+    //Validate ptid and ctid which we would be setting down if requested by 
these flags
+    if (((flags & CLONE_PARENT_SETTID) && !ptid) ||
+        ((flags & CLONE_CHILD_SETTID) && !ctid) ||
+        ((flags & CLONE_SETTLS) && !newtls)) { //A zero newtls combined with CLONE_SETTLS is rejected here as well
+       errno = EFAULT;
+       return -1;
+    }
+    //
+    //If the parent thread is pinned we should make new thread inherit this
+    auto parent_pinned_cpu = sched::thread::current()->pinned() ? 
sched::cpu::current() : nullptr;
+    //
+    //Create new child thread
+    auto t = sched::thread::make([=] {
+       //This lambda is the function handed to the new thread; it never returns normally (it ends with a jump)
+       //Switch to app TCB if one specified
+       u64 app_tcb = sched::thread::current()->get_app_tcb();
+       if (app_tcb) {
+           arch::set_fsbase(app_tcb);
+       }
+       //
+       //Restore registers from the syscall stack and jump to the caller
+       //We are restoring the registers based on how they were saved
+       //on the syscall stack of the parent
+       const size_t frame_offset = CHILD_FRAME_OFFSET;
+       asm volatile
+         ("movq %%gs:0, %%rsp \n\t"  //Switch to syscall stack
+          "subq %0, %%rsp \n\t"      //Adjust stack pointer to the start of 
the frame
+          "popq %%r15 \n\t"          //Registers are popped in the order they sit in the frame copied by sys_clone()
+          "popq %%r14 \n\t"
+          "popq %%r13 \n\t"
+          "popq %%r12 \n\t"
+          "popq %%r11 \n\t"
+          "popq %%r10 \n\t"
+          "popq %%r9  \n\t"
+          "popq %%r8  \n\t"
+          "popq %%rdi \n\t"
+          "popq %%rsi \n\t"
+          "popq %%rdx \n\t"
+          "popq %%rbx \n\t"
+          "addq $8, %%rsp \n\t"      //Skip one 8-byte slot of the frame without restoring it
+          "popq %%rbp \n\t"
+          "popq %%rcx \n\t"          //RCX receives the address to resume at (target of the jmpq below)
+          "popq %%rax \n\t"          //Slot zeroed by sys_clone() (RAX_OFFSET)
+          "pushq %%r11 \n\t"         //Load the flags register from the value just restored into R11
+          "popfq \n\t"
+          "popq %%rsp \n\t"          //Pop user stack to become new stack
+          "jmpq *%%rcx \n\t"         //Jump to where the child thread should 
continue
+               : : "r"(frame_offset));
+    }, sched::thread::attr().
+        stack(4096 * 4). //16K kernel stack should be large enough
+        pin(parent_pinned_cpu),
+        false, //presumably the 'main' argument of the thread constructor - confirm
+        true); //presumably the 'app' argument of the thread constructor - confirm
+
+    //The child is started by t->start() further down; the code until then prepares its state
+    //Store the child thread ID at the location pointed to by ptid
+    if ((flags & CLONE_PARENT_SETTID)) {
+       *ptid = t->id();
+    }
+    //
+    //Store the child thread ID at the location pointed to by ctid
+    if ((flags & CLONE_CHILD_SETTID)) {
+       *ctid = t->id();
+    }
+    //
+    //Clear (zero) the child thread ID at the location pointed to by child_tid
+    //in child memory when the child exits, and do a wakeup on the futex at 
that address
+    //See thread::complete()
+    if ((flags & CLONE_CHILD_CLEARTID)) {
+       t->set_clear_id(ctid);
+    }
+    //
+    //Copy all saved registers from parent syscall stack to the child syscall 
stack
+    //so that they can be restored in the child thread in the inlined assembly 
above
+    auto frame_start_on_child_syscall_stack = t->get_syscall_stack_top() - 
CHILD_FRAME_OFFSET;
+    auto frame_start_on_parent_syscall_stack = 
sched::thread::current()->get_syscall_stack_top() - PARENT_FRAME_OFFSET;
+    memcpy(frame_start_on_child_syscall_stack, 
frame_start_on_parent_syscall_stack, FRAME_SIZE);
+    //The two slots above the copied frame (RAX at top-16, RSP at top-8) are filled in separately
+    //Save child stack pointer at the top of the frame so it will be restored 
last
+    *reinterpret_cast<u64*>(t->get_syscall_stack_top() - RSP_OFFSET) = 
reinterpret_cast<u64>(child_stack);
+    *reinterpret_cast<u64*>(t->get_syscall_stack_top() - RAX_OFFSET) = 0; 
//RAX needs to be zeroed per clone()
+    //
+    // Set app TCB if CLONE_SETTLS flag set
+    if ((flags & CLONE_SETTLS)) {
+       t->set_app_tcb(newtls);
+    }
+    t->start(); //Start the child thread
+    //
+    //The manual of sigprocmask has this to say about clone:
+    //"Each of the threads in a process has its own signal mask.
+    // A child created via fork(2) inherits a copy of its parent's
+    // signal mask; the signal mask is preserved across execve(2)."
+    //TODO: Does it mean new thread should inherit signal mask of the parent?
+    return t->id();
+}
diff --git a/core/sched.cc b/core/sched.cc
--- a/core/sched.cc
+++ b/core/sched.cc
@@ -36,6 +36,8 @@ MAKE_SYMBOL(sched::preempt);
 MAKE_SYMBOL(sched::preempt_disable);
 MAKE_SYMBOL(sched::preempt_enable);
 
+int futex(int *uaddr, int op, int val, const struct timespec *timeout, int 
*uaddr2, uint32_t val3);
+
 __thread char* percpu_base;
 
 extern char _percpu_start[], _percpu_end[];
@@ -1164,6 +1166,9 @@ thread::thread(std::function<void ()> func, attr attr, 
bool main, bool app)
     }
 
     _parent_id = s_current ? s_current->id() : 0;
+
+    _clear_id = nullptr;
+    _robust_list_head = nullptr;
 }
 
 static std::list<std::function<void ()>> exit_notifiers
@@ -1439,10 +1444,46 @@ void thread::stop_wait()
     assert(st.load() == status::running);
 }
 
+// See https://www.kernel.org/doc/Documentation/robust-futexes.txt
+#define FUTEX_OWNER_DIED       0x40000000
+#define FUTEX_KEY_ADDR(x, o)    ((int *)((u8 *)(x) + (o)))
+
 void thread::complete()
 {
     run_exit_notifiers();
 
+    //The logic below only applies when running statically
+    //linked executables or dynamically linked ones launched by the
+    //Linux dynamic linker. More specifically it gets triggered only
+    //when set_tid_address and set_robust_list get called
+
+    //See https://www.kernel.org/doc/Documentation/robust-futexes.txt for
+    //details on this Linux specific logic
+    if (_robust_list_head) {
+        robust_list_head *h = _robust_list_head;
+        robust_list *l, *next;
+        int *uaddr;
+        //First flag the lock that was in the middle of being acquired or released, if any
+        if (h->list_op_pending) {
+            uaddr = FUTEX_KEY_ADDR(h->list_op_pending, h->futex_offset);
+            *uaddr |= FUTEX_OWNER_DIED;
+            futex(uaddr, FUTEX_WAKE, 1, nullptr, nullptr, 0);
+        }
+        //The list is circular and anchored at h->list (same address as h), so start at the first real entry and stop at the anchor
+        for (l = h->list.next; l && l != &h->list; l = next) {
+            next = l->next; //Read the link before the wake-up: a woken thread may relink this entry
+            uaddr = FUTEX_KEY_ADDR(l, h->futex_offset);
+            *uaddr |= FUTEX_OWNER_DIED;
+            futex(uaddr, FUTEX_WAKE, 1, nullptr, nullptr, 0);
+        }
+    }
+
+    //For more details about clear_id read CLONE_CHILD_CLEARTID section
+    //of https://man7.org/linux/man-pages/man2/clone.2.html
+    if (_clear_id) {
+        *_clear_id = 0;
+        futex(_clear_id, FUTEX_WAKE, 1, nullptr, nullptr, 0);
+    }
+
     auto value = detach_state::attached;
     _detach_state.compare_exchange_strong(value, 
detach_state::attached_complete);
     if (value == detach_state::detached) {
diff --git a/include/osv/sched.hh b/include/osv/sched.hh
--- a/include/osv/sched.hh
+++ b/include/osv/sched.hh
@@ -31,6 +31,25 @@
 
 typedef float runtime_t;
 
+enum { //Futex operation codes and option flags; declared in this header so that core/sched.cc can use FUTEX_WAKE too
+    FUTEX_WAIT           = 0,
+    FUTEX_WAKE           = 1,
+    FUTEX_WAIT_BITSET    = 9,
+    FUTEX_PRIVATE_FLAG   = 128,
+    FUTEX_CLOCK_REALTIME = 256,
+    FUTEX_CMD_MASK       = ~(FUTEX_PRIVATE_FLAG|FUTEX_CLOCK_REALTIME), //Every bit except the two option flags above
+};
+
+struct robust_list { //One link of a robust futex list
+    struct robust_list *next; //Next entry; thread::complete() follows these links
+};
+
+struct robust_list_head { //Per-thread head registered through set_robust_list()
+    struct robust_list list; //First member, so its address equals the address of the head itself
+    long futex_offset; //Byte offset added to an entry's address to find its futex word (see FUTEX_KEY_ADDR in core/sched.cc)
+    struct robust_list *list_op_pending; //When non-null, thread::complete() flags and wakes the futex of this entry too
+};
+
 extern "C" {
 void smp_main();
 #ifdef __aarch64__
@@ -692,6 +711,7 @@ public:
     bool unsafe_stop();
     void setup_large_syscall_stack();
     void free_tiny_syscall_stack();
+    void* get_syscall_stack_top();
 private:
     static void wake_impl(detached_state* st,
             unsigned allowed_initial_states_mask = 1 << 
unsigned(status::waiting));
@@ -737,6 +757,12 @@ public:
     {
         return _parent_id;
     }
+    void set_clear_id(int *clear_id) { //Registers the address that thread::complete() zeroes and futex-wakes
+        _clear_id = clear_id;
+    }
+    void set_robust_list(robust_list_head *list_head) { //Registers the robust list head that thread::complete() processes
+        _robust_list_head = list_head;
+    }
 private:
     virtual void timer_fired() override;
     struct detached_state;
@@ -882,6 +908,11 @@ private:
             osv::clock::uptime::time_point &running_since,
             osv::clock::uptime::duration &total_cpu_time);
     unsigned int _parent_id;
+    //These two variables are only meaningful when running statically
+    //linked executables or dynamically linked ones launched by the
+    //Linux dynamic linker
+    int *_clear_id;
+    robust_list_head *_robust_list_head;
 };
 
 class thread_handle {
diff --git a/linux.cc b/linux.cc
--- a/linux.cc
+++ b/linux.cc
@@ -72,14 +72,6 @@ extern "C" OSV_LIBC_API long gettid()
 // was missing. So the performance of this implementation is not critical.
 static std::unordered_map<void*, waitqueue> queues;
 static mutex queues_mutex;
-enum {
-    FUTEX_WAIT           = 0,
-    FUTEX_WAKE           = 1,
-    FUTEX_WAIT_BITSET    = 9,
-    FUTEX_PRIVATE_FLAG   = 128,
-    FUTEX_CLOCK_REALTIME = 256,
-    FUTEX_CMD_MASK       = ~(FUTEX_PRIVATE_FLAG|FUTEX_CLOCK_REALTIME),
-};
 
 #define FUTEX_BITSET_MATCH_ANY  0xffffffff
 
@@ -405,6 +397,47 @@ static long sys_getcpu(unsigned int *cpu, unsigned int 
*node, void *tcache)
     return 0;
 }
 
+#define __NR_sys_set_robust_list __NR_set_robust_list
+static long sys_set_robust_list(struct robust_list_head *head, size_t len)
+{   //Remembers 'head' as the calling thread's robust futex list and always reports success
+    //NOTE(review): 'len' is not examined
+    auto *self = sched::thread::current();
+    self->set_robust_list(head);
+    return 0;
+}
+
+#define __NR_sys_set_tid_address __NR_set_tid_address
+static long sys_set_tid_address(int *tidptr)
+{   //Registers 'tidptr' as the calling thread's clear-id address and returns that thread's id
+    auto *self = sched::thread::current();
+    self->set_clear_id(tidptr);
+    return self->id();
+}
+
+#ifdef __x86_64__
+#define __NR_sys_clone __NR_clone
+extern int sys_clone(unsigned long flags, void *child_stack, int *ptid, int 
*ctid, unsigned long newtls);
+
+struct clone_args { //Argument block read by sys_clone3()
+     u64 flags;       //Forwarded to sys_clone() as 'flags'
+     u64 pidfd;       //Not read by sys_clone3()
+     u64 child_tid;   //Cast to int* and forwarded as 'ctid'
+     u64 parent_tid;  //Cast to int* and forwarded as 'ptid'
+     u64 exit_signal; //Not read by sys_clone3()
+     u64 stack;       //Added to stack_size to form the 'child_stack' argument
+     u64 stack_size;  //See 'stack'
+     u64 tls;         //Forwarded as 'newtls'
+};
+
+#define __NR_sys_clone3 435
+static int sys_clone3(struct clone_args *args, size_t size)
+{   //Unpacks the clone3 argument block and delegates to sys_clone()
+    //The stack pointer handed over is args->stack advanced by args->stack_size
+    //NOTE(review): 'size' is not examined and 'args' is dereferenced without a null check
+    auto stack_top = reinterpret_cast<void*>(args->stack) + args->stack_size;
+    auto parent_tid = reinterpret_cast<int*>(args->parent_tid);
+    auto child_tid = reinterpret_cast<int*>(args->child_tid);
+    return sys_clone(args->flags, stack_top, parent_tid, child_tid, args->tls);
+}
+#endif
+
 #define __NR_sys_ioctl __NR_ioctl
 //
 // We need to define explicit sys_ioctl that takes these 3 parameters to 
conform
@@ -659,6 +692,12 @@ OSV_LIBC_API long syscall(long number, ...)
     SYSCALL2(fchmod, int, mode_t);
 #ifdef __x86_64__
     SYSCALL2(arch_prctl, int, unsigned long);
+#endif
+    SYSCALL2(sys_set_robust_list, struct robust_list_head *, size_t);
+    SYSCALL1(sys_set_tid_address, int *);
+#ifdef __x86_64__
+    SYSCALL5(sys_clone, unsigned long, void *, int *, int *, unsigned long);
+    SYSCALL2(sys_clone3, struct clone_args *, size_t);
 #endif
     }
 

-- 
You received this message because you are subscribed to the Google Groups "OSv 
Development" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to osv-dev+unsubscr...@googlegroups.com.
To view this discussion on the web visit 
https://groups.google.com/d/msgid/osv-dev/000000000000f6d8fd06095668ed%40google.com.

Reply via email to