On Thu, 2025-06-19 at 10:04 +0900, Hajime Tazaki wrote: > As userspace on UML/!MMU also needs to configure the %fs register when it is > running to correctly access the thread structure, host syscalls implemented > in os-Linux drivers may be confused when they are called. Thus it has to > configure the %fs register via arch_prctl(ARCH_SET_FS) on every host syscall.
Really, I still think that we should "just" get rid of libc entirely inside UML. That would avoid so many weird/potential issues … Doesn't change the fact that FS/GS needs to be restored when doing thread switches and such. Though one might be able to do it entirely within arch_switch_to then. Benjamin > > Signed-off-by: Hajime Tazaki <thehaj...@gmail.com> > Signed-off-by: Ricardo Koller <ricar...@google.com> > --- > arch/um/include/shared/os.h | 6 +++ > arch/um/os-Linux/process.c | 6 +++ > arch/um/os-Linux/start_up.c | 21 +++++++++ > arch/x86/um/nommu/do_syscall_64.c | 37 ++++++++++++++++ > arch/x86/um/nommu/syscalls_64.c | 71 +++++++++++++++++++++++++++++++ > 5 files changed, 141 insertions(+) > > diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h > index 1251f08e26d0..7c6a8bc0447c 100644 > --- a/arch/um/include/shared/os.h > +++ b/arch/um/include/shared/os.h > @@ -189,6 +189,7 @@ extern void check_host_supports_tls(int *supports_tls, > int *tls_min); > extern void get_host_cpu_features( > void (*flags_helper_func)(char *line), > void (*cache_helper_func)(char *line)); > +extern int host_has_fsgsbase; > > /* mem.c */ > extern int create_mem_file(unsigned long long len); > @@ -213,6 +214,11 @@ extern int os_protect_memory(void *addr, unsigned long > len, > extern int os_unmap_memory(void *addr, int len); > extern int os_drop_memory(void *addr, int length); > extern int can_drop_memory(void); > +extern int os_arch_prctl(int pid, int option, unsigned long *arg); > +#ifndef CONFIG_MMU > +extern long long host_fs; > +#endif > + > > void os_set_pdeathsig(void); > > diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c > index 8a1ab59a089f..3a6d34ccd12b 100644 > --- a/arch/um/os-Linux/process.c > +++ b/arch/um/os-Linux/process.c > @@ -16,6 +16,7 @@ > #include <sys/prctl.h> > #include <sys/wait.h> > #include <asm/unistd.h> > +#include <sys/syscall.h> /* For SYS_xxx definitions */ > #include <linux/threads.h> > #include <init.h> > 
#include <longjmp.h> > @@ -178,6 +179,11 @@ int __init can_drop_memory(void) > return ok; > } > > +int os_arch_prctl(int pid, int option, unsigned long *arg2) > +{ > + return syscall(SYS_arch_prctl, option, arg2); > +} > + > void init_new_thread_signals(void) > { > set_handler(SIGSEGV); > diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c > index 4e1f05360c49..55dd92bd2a0b 100644 > --- a/arch/um/os-Linux/start_up.c > +++ b/arch/um/os-Linux/start_up.c > @@ -20,6 +20,8 @@ > #include <sys/resource.h> > #include <asm/ldt.h> > #include <asm/unistd.h> > +#include <sys/auxv.h> > +#include <asm/hwcap2.h> > #include <init.h> > #include <os.h> > #include <kern_util.h> > @@ -36,6 +38,8 @@ > #include <skas.h> > #include "internal.h" > > +int host_has_fsgsbase; > + > static void ptrace_child(void) > { > int ret; > @@ -459,6 +463,20 @@ __uml_setup("seccomp=", uml_seccomp_config, > " This is insecure and should only be used with a trusted userspace\n\n" > ); > > +static void __init check_fsgsbase(void) > +{ > + unsigned long auxv = getauxval(AT_HWCAP2); > + > + os_info("Checking FSGSBASE instructions..."); > + if (auxv & HWCAP2_FSGSBASE) { > + host_has_fsgsbase = 1; > + os_info("OK\n"); > + } else { > + host_has_fsgsbase = 0; > + os_info("disabled\n"); > + } > +} > + > void __init os_early_checks(void) > { > int pid; > @@ -484,6 +502,9 @@ void __init os_early_checks(void) > using_seccomp = 0; > check_ptrace(); > > + /* probe fsgsbase instruction */ > + check_fsgsbase(); > + > pid = start_ptraced_child(); > if (init_pid_registers(pid)) > fatal("Failed to initialize default registers"); > diff --git a/arch/x86/um/nommu/do_syscall_64.c > b/arch/x86/um/nommu/do_syscall_64.c > index 5d0fa83e7fdc..796beb0089fc 100644 > --- a/arch/x86/um/nommu/do_syscall_64.c > +++ b/arch/x86/um/nommu/do_syscall_64.c > @@ -2,10 +2,38 @@ > > #include <linux/kernel.h> > #include <linux/ptrace.h> > +#include <asm/fsgsbase.h> > +#include <asm/prctl.h> > #include <kern_util.h> > #include 
<sysdep/syscalls.h> > #include <os.h> > > +static int os_x86_arch_prctl(int pid, int option, unsigned long *arg2) > +{ > + if (!host_has_fsgsbase) > + return os_arch_prctl(pid, option, arg2); > + > + switch (option) { > + case ARCH_SET_FS: > + wrfsbase(*arg2); > + break; > + case ARCH_SET_GS: > + wrgsbase(*arg2); > + break; > + case ARCH_GET_FS: > + *arg2 = rdfsbase(); > + break; > + case ARCH_GET_GS: > + *arg2 = rdgsbase(); > + break; > + default: > + pr_warn("%s: unsupported option: 0x%x", __func__, option); > + break; > + } > + > + return 0; > +} > + > __visible void do_syscall_64(struct pt_regs *regs) > { > int syscall; > @@ -17,6 +45,9 @@ __visible void do_syscall_64(struct pt_regs *regs) > syscall, (unsigned long)current, > (unsigned long)sys_call_table[syscall]); > > + /* set fs register to the original host one */ > + os_x86_arch_prctl(0, ARCH_SET_FS, (void *)host_fs); > + > if (likely(syscall < NR_syscalls)) { > PT_REGS_SET_SYSCALL_RETURN(regs, > EXECUTE_SYSCALL(syscall, regs)); > @@ -34,4 +65,10 @@ __visible void do_syscall_64(struct pt_regs *regs) > /* force do_signal() --> is_syscall() */ > set_thread_flag(TIF_SIGPENDING); > interrupt_end(); > + > + /* restore back fs register to userspace configured one */ > + os_x86_arch_prctl(0, ARCH_SET_FS, > + (void *)(current->thread.regs.regs.gp[FS_BASE > + / sizeof(unsigned long)])); > + > } > diff --git a/arch/x86/um/nommu/syscalls_64.c b/arch/x86/um/nommu/syscalls_64.c > index c78c442aed1d..5bb6d55b4bb5 100644 > --- a/arch/x86/um/nommu/syscalls_64.c > +++ b/arch/x86/um/nommu/syscalls_64.c > @@ -13,8 +13,70 @@ > #include <asm/prctl.h> /* XXX This should get the constants from libc */ > #include <registers.h> > #include <os.h> > +#include <asm/thread_info.h> > +#include <asm/mman.h> > #include "syscalls.h" > > +/* > + * The guest libc can change FS, which confuses the host libc. > + * In fact, changing FS directly is not supported (check > + * man arch_prctl). 
So, whenever we make a host syscall, > + * we should be changing FS to the original FS (not the > + * one set by the guest libc). This original FS is stored > + * in host_fs. > + */ > +long long host_fs = -1; > + > +long arch_prctl(struct task_struct *task, int option, > + unsigned long __user *arg2) > +{ > + long ret = -EINVAL; > + unsigned long *ptr = arg2, tmp; > + > + switch (option) { > + case ARCH_SET_FS: > + if (host_fs == -1) > + os_arch_prctl(0, ARCH_GET_FS, (void *)&host_fs); > + ret = 0; > + break; > + case ARCH_SET_GS: > + ret = 0; > + break; > + case ARCH_GET_FS: > + case ARCH_GET_GS: > + ptr = &tmp; > + break; > + } > + > + ret = os_arch_prctl(0, option, ptr); > + if (ret) > + return ret; > + > + switch (option) { > + case ARCH_SET_FS: > + current->thread.regs.regs.gp[FS_BASE / sizeof(unsigned long)] = > + (unsigned long) arg2; > + break; > + case ARCH_SET_GS: > + current->thread.regs.regs.gp[GS_BASE / sizeof(unsigned long)] = > + (unsigned long) arg2; > + break; > + case ARCH_GET_FS: > + ret = put_user(current->thread.regs.regs.gp[FS_BASE / > sizeof(unsigned long)], arg2); > + break; > + case ARCH_GET_GS: > + ret = put_user(current->thread.regs.regs.gp[GS_BASE / > sizeof(unsigned long)], arg2); > + break; > + } > + > + return ret; > +} > + > +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) > +{ > + return arch_prctl(current, option, (unsigned long __user *) arg2); > +} > + > void arch_switch_to(struct task_struct *to) > { > /* > @@ -42,3 +104,12 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, > len, > > return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); > } > + > +static int __init um_nommu_setup_hostfs(void) > +{ > + /* initialize the host_fs value at boottime */ > + os_arch_prctl(0, ARCH_GET_FS, (void *)&host_fs); > + > + return 0; > +} > +arch_initcall(um_nommu_setup_hostfs);