From: Waldemar Kozaczuk <[email protected]>
Committer: Waldemar Kozaczuk <[email protected]>
Branch: master

aarch64: implement clone/clone3 to run multi-threaded static apps

Just like the patch b3792dfa62149a0f8c5dd75d445dcf2266235de1, this one
implements the clone/clone3 system calls, but on aarch64. For more details,
please read the code comments.

In addition, this patch refactors the clone code by extracting the common
logic into the sys_clone() function in linux.cc and leaving the arch-specific
code in clone_thread() found under arch/$(arch)/clone.cc

With this patch, one can run multi-threaded static executables and
dynamic ones with the Linux dynamic linker on OSv on aarch64.

./scripts/test.py --linux_ld -m modules/tests-with-linux-ld/usr.manifest \
  -d java_no_wrapper \
  -d tst-chmod \
  -d tst-kill \
  -d tst-remove \
  -d tst-sigaction \
  -d tst-sigwait \
  -d tst-stdio-rofs \
  -d tst-wctype

Please note that java_wrapper crashes because AT_SYMLINK_NOFOLLOW is not
implemented in faccessat(), and that tst-chmod and tst-remove fail because
the fchmodat syscall is missing.

Signed-off-by: Waldemar Kozaczuk <[email protected]>

---
diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -977,6 +977,7 @@ objects += arch/$(arch)/cpuid.o
 objects += arch/$(arch)/firmware.o
 objects += arch/$(arch)/hypervisor.o
 objects += arch/$(arch)/interrupt.o
+objects += arch/$(arch)/clone.o
 ifeq ($(conf_drivers_pci),1)
 objects += arch/$(arch)/pci.o
 objects += arch/$(arch)/msi.o
@@ -1013,7 +1014,6 @@ objects += arch/x64/apic.o
 objects += arch/x64/apic-clock.o
 objects += arch/x64/entry-xen.o
 objects += arch/x64/prctl.o
-objects += arch/x64/clone.o
 objects += arch/x64/vmlinux.o
 objects += arch/x64/vmlinux-boot64.o
 objects += arch/x64/pvh-boot.o
diff --git a/arch/aarch64/clone.cc b/arch/aarch64/clone.cc
--- a/arch/aarch64/clone.cc
+++ b/arch/aarch64/clone.cc
@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) 2023 Waldemar Kozaczuk
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+#include "arch.hh"
+#include <errno.h>
+#include <osv/sched.hh>
+
+#define CLONE_SETTLS           0x00080000
+
+static constexpr size_t CHILD_FRAME_OFFSET = 7*4096 + sizeof(exception_frame);
+static constexpr size_t PARENT_FRAME_OFFSET = sizeof(exception_frame);
+
+sched::thread *clone_thread(unsigned long flags, void *child_stack, unsigned 
long newtls)
+{   //
+    //If the parent thread is pinned we should make new thread inherit this
+    auto parent_pinned_cpu = sched::thread::current()->pinned() ? 
sched::cpu::current() : nullptr;
+    //
+    //Create new child thread
+    auto t = sched::thread::make([=] {
+       //
+       //Switch to app TCB if one specified
+       auto frame_start_on_exception_stack = 
sched::thread::current()->get_exception_stack_top() - CHILD_FRAME_OFFSET;
+       exception_frame *child_frame = 
reinterpret_cast<exception_frame*>(frame_start_on_exception_stack);
+       if (child_frame->far) {
+           asm volatile ("msr tpidr_el0, %0; isb; " :: "r"(child_frame->far) : 
"memory");
+       }
+       //
+       //Restore registers from the exception stack and jump to the caller
+       //We are restoring the registers based on how they were saved
+       //on the exception stack of the parent
+       asm volatile
+         ("msr daifset, #2 \n\t"          // Disable interrupts
+          "isb \n\t"
+          "mov sp, %0 \n\t"               // Set child stack
+          "msr spsel, #0 \n\t"            // Switch to exception stack
+          "mov sp, %1 \n\t"               // Set stack to the beginning of the 
stack frame
+          "ldr x30, [sp, #256] \n\t"      // Load x30 (link register) with 
elr_el1 (exception link register)
+          "ldp x0, x1, [sp], #16 \n\t"
+          "ldp x2, x3, [sp], #16 \n\t"
+          "ldp x4, x5, [sp], #16 \n\t"
+          "ldp x6, x7, [sp], #16 \n\t"
+          "ldp x8, x9, [sp], #16 \n\t"
+          "ldp x10, x11, [sp], #16 \n\t"
+          "ldp x12, x13, [sp], #16 \n\t"
+          "ldp x14, x15, [sp], #16 \n\t"
+          "ldp x16, x17, [sp], #16 \n\t"
+          "ldp x18, x19, [sp], #16 \n\t"
+          "ldp x20, x21, [sp], #16 \n\t"
+          "ldp x22, x23, [sp], #16 \n\t"
+          "ldp x24, x25, [sp], #16 \n\t"
+          "ldp x26, x27, [sp], #16 \n\t"
+          "ldp x28, x29, [sp], #16 \n\t"
+          "add sp, sp, #48 \n\t"
+          "add sp, sp, #28672 \n\t"       // Move back 7*4096
+          "msr spsel, #1 \n\t"            // Switch to user stack
+          "msr daifclr, #2 \n\t"          // Enable interrupts
+          "isb \n\t" : : "r"(child_frame->sp), 
"r"(frame_start_on_exception_stack));
+    }, sched::thread::attr().
+        stack(4096 * 4). //16K kernel stack should be large enough
+        pin(parent_pinned_cpu),
+        false,
+        true);
+    //
+    //Copy all saved registers from parent exception stack to the child 
exception stack
+    //so that they can be restored in the child thread in the inlined assembly 
above
+    auto frame_start_on_child_exception_stack = t->get_exception_stack_top() - 
CHILD_FRAME_OFFSET;
+    exception_frame *child_frame = 
reinterpret_cast<exception_frame*>(frame_start_on_child_exception_stack);
+    auto frame_start_on_parent_exception_stack = 
sched::thread::current()->get_exception_stack_top() - PARENT_FRAME_OFFSET;
+    exception_frame *parent_frame = 
reinterpret_cast<exception_frame*>(frame_start_on_parent_exception_stack);
+    memcpy(child_frame, parent_frame, sizeof(*parent_frame));
+    //
+    // Save child stack pointer
+    child_frame->sp = reinterpret_cast<u64>(child_stack);
+    child_frame->regs[0] = 0;
+    //
+    // Set app TCB if CLONE_SETTLS flag set
+    if ((flags & CLONE_SETTLS)) {
+       child_frame->far = newtls;
+    } else {
+       child_frame->far = 0;
+    }
+
+    return t;
+}
diff --git a/arch/x64/clone.cc b/arch/x64/clone.cc
--- a/arch/x64/clone.cc
+++ b/arch/x64/clone.cc
@@ -10,40 +10,16 @@
 #include <osv/sched.hh>
 #include "tls-switch.hh"
 
-#define CLONE_THREAD           0x00010000
 #define CLONE_SETTLS           0x00080000
-#define CLONE_CHILD_SETTID     0x01000000
-#define CLONE_PARENT_SETTID    0x00100000
-#define CLONE_CHILD_CLEARTID   0x00200000
 
 static constexpr size_t CHILD_FRAME_OFFSET = 136;
 static constexpr size_t PARENT_FRAME_OFFSET = 120;
 static constexpr size_t FRAME_SIZE = 120;
 static constexpr size_t RSP_OFFSET = 8;
 static constexpr size_t RAX_OFFSET = 16;
 
-int sys_clone(unsigned long flags, void *child_stack, int *ptid, int *ctid, 
unsigned long newtls)
+sched::thread *clone_thread(unsigned long flags, void *child_stack, unsigned 
long newtls)
 {   //
-    //We only support "cloning" of threads so fork() would fail but 
pthread_create() should
-    //succeed
-    if (!(flags & CLONE_THREAD)) {
-       errno = ENOSYS;
-       return -1;
-    }
-    //
-    //Validate we have non-empty stack
-    if (!child_stack) {
-       errno = EINVAL;
-       return -1;
-    }
-    //
-    //Validate ptid and ctid which we would be setting down if requested by 
these flags
-    if (((flags & CLONE_PARENT_SETTID) && !ptid) ||
-        ((flags & CLONE_CHILD_SETTID) && !ctid) ||
-        ((flags & CLONE_SETTLS) && !newtls)) {
-       errno = EFAULT;
-       return -1;
-    }
     //
     //If the parent thread is pinned we should make new thread inherit this
     auto parent_pinned_cpu = sched::thread::current()->pinned() ? 
sched::cpu::current() : nullptr;
@@ -91,23 +67,6 @@ int sys_clone(unsigned long flags, void *child_stack, int 
*ptid, int *ctid, unsi
         false,
         true);
 
-    //
-    //Store the child thread ID at the location pointed to by ptid
-    if ((flags & CLONE_PARENT_SETTID)) {
-       *ptid = t->id();
-    }
-    //
-    //Store the child thread ID at the location pointed to by ctid
-    if ((flags & CLONE_CHILD_SETTID)) {
-       *ctid = t->id();
-    }
-    //
-    //Clear (zero) the child thread ID at the location pointed to by child_tid
-    //in child memory when the child exits, and do a wakeup on the futex at 
that address
-    //See thread::complete()
-    if ((flags & CLONE_CHILD_CLEARTID)) {
-       t->set_clear_id(ctid);
-    }
     //
     //Copy all saved registers from parent syscall stack to the child syscall 
stack
     //so that they can be restored in the child thread in the inlined assembly 
above
@@ -123,12 +82,6 @@ int sys_clone(unsigned long flags, void *child_stack, int 
*ptid, int *ctid, unsi
     if ((flags & CLONE_SETTLS)) {
        t->set_app_tcb(newtls);
     }
-    t->start();
-    //
-    //The manual of sigprocmask has this to say about clone:
-    //"Each of the threads in a process has its own signal mask.
-    // A child created via fork(2) inherits a copy of its parent's
-    // signal mask; the signal mask is preserved across execve(2)."
-    //TODO: Does it mean new thread should inherit signal mask of the parent?
-    return t->id();
+
+    return t;
 }
diff --git a/core/elf.cc b/core/elf.cc
--- a/core/elf.cc
+++ b/core/elf.cc
@@ -535,9 +535,6 @@ void object::process_headers()
             abort("Unknown p_type in executable %s: %d\n", pathname(), 
phdr.p_type);
         }
     }
-    if (!is_core() && is_statically_linked_executable()) {
-        std::cout << "WARNING: Statically linked executables are only 
supported to limited extent!\n";
-    }
     if (_is_dynamically_linked_executable && _tls_segment) {
         auto app_tls_size = get_aligned_tls_size();
         ulong pie_static_tls_maximum_size = &_pie_static_tls_end - 
&_pie_static_tls_start;
diff --git a/include/osv/sched.hh b/include/osv/sched.hh
--- a/include/osv/sched.hh
+++ b/include/osv/sched.hh
@@ -711,7 +711,10 @@ public:
     bool unsafe_stop();
     void setup_large_syscall_stack();
     void free_tiny_syscall_stack();
+#ifdef __x86_64__
     void* get_syscall_stack_top();
+#endif
+    void* get_exception_stack_top() { return _arch.exception_stack + 
sizeof(_arch.exception_stack); }
 private:
     static void wake_impl(detached_state* st,
             unsigned allowed_initial_states_mask = 1 << 
unsigned(status::waiting));
diff --git a/linux.cc b/linux.cc
--- a/linux.cc
+++ b/linux.cc
@@ -441,9 +441,71 @@ static long sys_set_tid_address(int *tidptr)
     return sched::thread::current()->id();
 }
 
-#ifdef __x86_64__
+#define CLONE_THREAD           0x00010000
+#define CLONE_CHILD_SETTID     0x01000000
+#define CLONE_PARENT_SETTID    0x00100000
+#define CLONE_CHILD_CLEARTID   0x00200000
+
+extern sched::thread *clone_thread(unsigned long flags, void *child_stack, 
unsigned long newtls);
+
 #define __NR_sys_clone __NR_clone
-extern int sys_clone(unsigned long flags, void *child_stack, int *ptid, int 
*ctid, unsigned long newtls);
+#ifdef __x86_64__
+int sys_clone(unsigned long flags, void *child_stack, int *ptid, int *ctid, 
unsigned long newtls)
+#endif
+#ifdef __aarch64__
+int sys_clone(unsigned long flags, void *child_stack, int *ptid, unsigned long 
newtls, int *ctid)
+#endif
+{   //
+    //We only support "cloning" of threads so fork() would fail but 
pthread_create() should
+    //succeed
+    if (!(flags & CLONE_THREAD)) {
+       errno = ENOSYS;
+       return -1;
+    }
+    //
+    //Validate we have non-empty stack
+    if (!child_stack) {
+       errno = EINVAL;
+       return -1;
+    }
+    //
+    //Validate ptid and ctid which we would be setting down if requested by 
these flags
+    if (((flags & CLONE_PARENT_SETTID) && !ptid) ||
+        ((flags & CLONE_CHILD_SETTID) && !ctid) ||
+        ((flags & CLONE_SETTLS) && !newtls)) {
+       errno = EFAULT;
+       return -1;
+    }
+
+    sched::thread *t = clone_thread(flags, child_stack, newtls);
+
+    //
+    //Store the child thread ID at the location pointed to by ptid
+    if ((flags & CLONE_PARENT_SETTID)) {
+       *ptid = t->id();
+    }
+    //
+    //Store the child thread ID at the location pointed to by ctid
+    if ((flags & CLONE_CHILD_SETTID)) {
+       *ctid = t->id();
+    }
+    //
+    //Clear (zero) the child thread ID at the location pointed to by child_tid
+    //in child memory when the child exits, and do a wakeup on the futex at 
that address
+    //See thread::complete()
+    if ((flags & CLONE_CHILD_CLEARTID)) {
+       t->set_clear_id(ctid);
+    }
+    t->start();
+
+    //
+    //The manual of sigprocmask has this to say about clone:
+    //"Each of the threads in a process has its own signal mask.
+    // A child created via fork(2) inherits a copy of its parent's
+    // signal mask; the signal mask is preserved across execve(2)."
+    //TODO: Does it mean new thread should inherit signal mask of the parent?
+    return t->id();
+}
 
 struct clone_args {
      u64 flags;
@@ -463,10 +525,15 @@ static int sys_clone3(struct clone_args *args, size_t 
size)
        args->flags,
        reinterpret_cast<void*>(args->stack) + args->stack_size,
        reinterpret_cast<int*>(args->parent_tid),
+#ifdef __x86_64__
        reinterpret_cast<int*>(args->child_tid),
        args->tls);
-}
 #endif
+#ifdef __aarch64__
+       args->tls,
+       reinterpret_cast<int*>(args->child_tid));
+#endif
+}
 
 #define __NR_sys_ioctl __NR_ioctl
 //
@@ -592,7 +659,7 @@ extern int utimensat4(int dirfd, const char *pathname, 
const struct timespec tim
 TRACEPOINT(trace_syscall_open, "%d <= \"%s\" 0x%x", int, const char *, int);
 #endif
 TRACEPOINT(trace_syscall_read, "0x%x <= %d %p 0x%x", ssize_t, int, char *, 
size_t);
-TRACEPOINT(trace_syscall_uname, "%d <= ", int, struct utsname *);
+TRACEPOINT(trace_syscall_uname, "%d <= %p", int, struct utsname *);
 TRACEPOINT(trace_syscall_write, "0x%x <= %d %p 0x%x", ssize_t, int, const void 
*, size_t);
 TRACEPOINT(trace_syscall_gettid, "%d <=", pid_t);
 TRACEPOINT(trace_syscall_clock_gettime, "%d <= %d %p", int, clockid_t, struct 
timespec *);
@@ -654,7 +721,7 @@ TRACEPOINT(trace_syscall_nanosleep, "%d <= %p %p", int, 
const struct timespec*,
 TRACEPOINT(trace_syscall_fstatat, "%d <= %d \"%s\" %p 0%0o", int, int, const 
char *, struct stat *, int);
 TRACEPOINT(trace_syscall_sys_exit_group, "%d <= %d", int, int);
 TRACEPOINT(trace_syscall_sys_getcwd, "%ld <= 0%0o %lu", long, char *, unsigned 
long);
-TRACEPOINT(trace_syscall_readlinkat, "%lu <= %d 0%0o 0x%x %lu", ssize_t, int, 
const char *, char *, size_t);
+TRACEPOINT(trace_syscall_readlinkat, "%lu <= %d %s 0x%x %lu", ssize_t, int, 
const char *, char *, size_t);
 TRACEPOINT(trace_syscall_getpid, "%d <=", pid_t);
 TRACEPOINT(trace_syscall_set_mempolicy, "%ld <= %d %p %lu", long, int, 
unsigned long *, unsigned long);
 TRACEPOINT(trace_syscall_sys_sched_setaffinity, "%d <= %d %u %p", int, pid_t, 
unsigned, unsigned long *);
@@ -726,8 +793,11 @@ TRACEPOINT(trace_syscall_sys_set_robust_list, "%d <= %p 
%lu", long, struct robus
 TRACEPOINT(trace_syscall_sys_set_tid_address, "%d <= %p", long, int *);
 #ifdef __x86_64__
 TRACEPOINT(trace_syscall_sys_clone, "%d <= 0x%x 0x%x %p %p %lu", int, unsigned 
long, void *, int *, int *, unsigned long);
-TRACEPOINT(trace_syscall_sys_clone3, "%d <= %p %lu", int, struct clone_args *, 
size_t);
 #endif
+#ifdef __aarch64__
+TRACEPOINT(trace_syscall_sys_clone, "%d <= 0x%x 0x%x %p %p %lu", int, unsigned 
long, void *, int *, unsigned long, int *);
+#endif
+TRACEPOINT(trace_syscall_sys_clone3, "%d <= %p %lu", int, struct clone_args *, 
size_t);
 TRACEPOINT(trace_syscall_prlimit64, "%d <= %u %d %p %p", int, pid_t, int, 
const struct rlimit *, struct rlimit *);
 TRACEPOINT(trace_syscall_msync, "%d <= 0x%x %lu %d", int, void *, size_t, int);
 TRACEPOINT(trace_syscall_truncate, "%d <= %s %ld", int, const char *, off_t);
@@ -739,6 +809,7 @@ TRACEPOINT(trace_syscall_rt_sigtimedwait, "%d <= %p %p %p 
%lu", int, const sigse
 TRACEPOINT(trace_syscall_getrlimit, "%d <= %d %p", int, int, struct rlimit *);
 TRACEPOINT(trace_syscall_getpriority, "%d <= %d %d", int, int, int);
 TRACEPOINT(trace_syscall_setpriority, "%d <= %d %d %d", int, int, int, int);
+TRACEPOINT(trace_syscall_ppoll, "%d <= %p %ld %p %p", int, struct pollfd *, 
nfds_t, const struct timespec *, const sigset_t *);
 
 OSV_LIBC_API long syscall(long number, ...)
 {
@@ -885,8 +956,11 @@ OSV_LIBC_API long syscall(long number, ...)
     SYSCALL1(sys_set_tid_address, int *);
 #ifdef __x86_64__
     SYSCALL5(sys_clone, unsigned long, void *, int *, int *, unsigned long);
-    SYSCALL2(sys_clone3, struct clone_args *, size_t);
 #endif
+#ifdef __aarch64__
+    SYSCALL5(sys_clone, unsigned long, void *, int *, unsigned long, int *);
+#endif
+    SYSCALL2(sys_clone3, struct clone_args *, size_t);
     SYSCALL4(prlimit64, pid_t, int, const struct rlimit *, struct rlimit *);
     SYSCALL3(msync, void *, size_t, int);
     SYSCALL2(truncate, const char *, off_t);
@@ -898,6 +972,7 @@ OSV_LIBC_API long syscall(long number, ...)
     SYSCALL2(getrlimit, int, struct rlimit *);
     SYSCALL2(getpriority, int, int);
     SYSCALL3(setpriority, int, int, int);
+    SYSCALL4(ppoll, struct pollfd *, nfds_t, const struct timespec *, const 
sigset_t *);
     }
 
     debug_always("syscall(): unimplemented system call %d\n", number);

-- 
You received this message because you are subscribed to the Google Groups "OSv 
Development" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To view this discussion on the web visit 
https://groups.google.com/d/msgid/osv-dev/0000000000001fe5f0060d30930c%40google.com.

Reply via email to