Applied, thanks!

Damien Zammit, le dim. 18 janv. 2026 03:47:07 +0000, a écrit:
> This is difficult to get correct: we add the r12 register to the
> interrupt state restore so that every interrupt context knows which mode
> to return to for swapgs.  Testing whether the gs base is negative is
> enough to know which mode we were entered from, since the user cannot
> currently set a negative gs base.  (That would require the FSGSBASE CPU
> feature to be enabled, but for now we leave it disabled.)
> syscall64 enters with interrupts disabled and requires swapgs at
> beginning and end, as it is known to be called only from userspace.
> 
> ---
>  i386/i386/gdt.c    | 18 +++++++++
>  i386/i386/ldt.c    |  2 +-
>  i386/i386/pcb.c    |  8 ++--
>  i386/i386/thread.h |  1 +
>  x86_64/boothdr.S   |  2 -
>  x86_64/locore.S    | 92 ++++++++++++++++++++++++++++++++++++----------
>  6 files changed, 98 insertions(+), 25 deletions(-)
> 
> diff --git a/i386/i386/gdt.c b/i386/i386/gdt.c
> index 8c4a59c7..585583b7 100644
> --- a/i386/i386/gdt.c
> +++ b/i386/i386/gdt.c
> @@ -40,6 +40,7 @@
>  
>  #include "vm_param.h"
>  #include "seg.h"
> +#include "msr.h"
>  #include "gdt.h"
>  #include "mp_desc.h"
>  
> @@ -109,6 +110,17 @@ gdt_fill(int cpu, struct real_descriptor *mygdt)
>  #endif       /* MACH_PV_DESCRIPTORS */
>  }
>  
> +#ifdef __x86_64__
> +static void
> +reload_gs_base(int cpu)
> +{
> +     /* KGSBASE is kernels gs base while in userspace,
> +      * but when in kernel, GSBASE must point to percpu area. */
> +     wrmsr(MSR_REG_GSBASE, (uint64_t)&percpu_array[cpu]);
> +     wrmsr(MSR_REG_KGSBASE, 0);
> +}
> +#endif
> +
>  static void
>  reload_segs(void)
>  {
> @@ -138,6 +150,9 @@ gdt_init(void)
>       gdt_fill(0, gdt);
>  
>       reload_segs();
> +#ifdef __x86_64__
> +     reload_gs_base(0);
> +#endif
>  
>  #ifdef       MACH_PV_PAGETABLES
>  #if VM_MIN_KERNEL_ADDRESS != LINEAR_MIN_KERNEL_ADDRESS
> @@ -157,5 +172,8 @@ ap_gdt_init(int cpu)
>       gdt_fill(cpu, mp_gdt[cpu]);
>  
>       reload_segs();
> +#ifdef __x86_64__
> +     reload_gs_base(cpu);
> +#endif
>  }
>  #endif
> diff --git a/i386/i386/ldt.c b/i386/i386/ldt.c
> index 7db67f99..6a13a7f8 100644
> --- a/i386/i386/ldt.c
> +++ b/i386/i386/ldt.c
> @@ -72,7 +72,7 @@ ldt_fill(struct real_descriptor *myldt, struct 
> real_descriptor *mygdt)
>  #if defined(__x86_64__) && ! defined(USER32)
>          if (!CPU_HAS_FEATURE(CPU_FEATURE_SEP))
>              panic("syscall support is missing on 64 bit");
> -        /* Enable 64-bit syscalls */
> +        /* Enable 64-bit syscalls with interrupts disabled on entry */
>          wrmsr(MSR_REG_EFER, rdmsr(MSR_REG_EFER) | MSR_EFER_SCE);
>          wrmsr(MSR_REG_LSTAR, (vm_offset_t)syscall64);
>          wrmsr(MSR_REG_STAR, ((((long)USER_CS - 16) << 16) | (long)KERNEL_CS) 
> << 32);
> diff --git a/i386/i386/pcb.c b/i386/i386/pcb.c
> index d845b2b2..e4ac2bb9 100644
> --- a/i386/i386/pcb.c
> +++ b/i386/i386/pcb.c
> @@ -230,7 +230,7 @@ void switch_ktss(pcb_t pcb)
>  
>  #if defined(__x86_64__) && !defined(USER32)
>       wrmsr(MSR_REG_FSBASE, pcb->ims.sbs.fsbase);
> -     wrmsr(MSR_REG_GSBASE, pcb->ims.sbs.gsbase);
> +     wrmsr(MSR_REG_KGSBASE, pcb->ims.sbs.gsbase);
>  #endif
>  
>       db_load_context(pcb);
> @@ -710,11 +710,13 @@ kern_return_t thread_setstatus(
>                              return KERN_INVALID_ARGUMENT;
>  
>                      state = (struct i386_fsgs_base_state *) tstate;
> +                    if (state->gs_base & 0x8000000000000000UL)
> +                            printf("WARNING: negative gs base not 
> allowed\n");
>                      thread->pcb->ims.sbs.fsbase = state->fs_base;
> -                    thread->pcb->ims.sbs.gsbase = state->gs_base;
> +                    thread->pcb->ims.sbs.gsbase = state->gs_base & 
> 0x7fffffffffffffffUL;
>                      if (thread == current_thread()) {
>                              wrmsr(MSR_REG_FSBASE, state->fs_base);
> -                            wrmsr(MSR_REG_GSBASE, state->gs_base);
> +                            wrmsr(MSR_REG_KGSBASE, state->gs_base);
>                      }
>                      break;
>              }
> diff --git a/i386/i386/thread.h b/i386/i386/thread.h
> index 9c88d09a..5112bc83 100644
> --- a/i386/i386/thread.h
> +++ b/i386/i386/thread.h
> @@ -183,6 +183,7 @@ struct i386_interrupt_state {
>       long    ds;
>  #endif
>  #ifdef __x86_64__
> +     long    r12;
>       long    r11;
>       long    r10;
>       long    r9;
> diff --git a/x86_64/boothdr.S b/x86_64/boothdr.S
> index 45d59c06..98f1cab2 100644
> --- a/x86_64/boothdr.S
> +++ b/x86_64/boothdr.S
> @@ -185,7 +185,6 @@ boot_entry64:
>       andq    $(~15),%rax
>       movq    %rax,%rsp
>  
> -#if NCPUS > 1
>       /* Set GS base address for kernel */
>       movq    $percpu_array, %rdx
>       movl    %edx, %eax
> @@ -198,7 +197,6 @@ boot_entry64:
>       xorl    %edx, %edx
>       movl    $MSR_REG_KGSBASE, %ecx
>       wrmsr
> -#endif
>  
>       /* Reset EFLAGS to a known state.  */
>       pushq   $0
> diff --git a/x86_64/locore.S b/x86_64/locore.S
> index 6afda87a..907502ef 100644
> --- a/x86_64/locore.S
> +++ b/x86_64/locore.S
> @@ -88,7 +88,8 @@
>       pushq   %r8     ;\
>       pushq   %r9     ;\
>       pushq   %r10     ;\
> -     pushq   %r11
> +     pushq   %r11    ;\
> +     pushq   %r12
>  
>  #define PUSH_AREGS_ISR       \
>       pushq   %rax    ;\
> @@ -96,6 +97,7 @@
>  
>  
>  #define POP_REGS_ISR \
> +     popq    %r12    ;\
>       popq    %r11    ;\
>       popq    %r10     ;\
>       popq    %r9     ;\
> @@ -163,21 +165,54 @@
>  #define POP_SEGMENTS_ISR(reg)
>  #endif
>  
> -#if NCPUS > 1
> -#define SET_KERNEL_SEGMENTS(reg)                \
> -     ud2             /* TODO: use swapgs or similar */
> -#else // NCPUS > 1
>  #ifdef USER32
> -#define SET_KERNEL_SEGMENTS(reg)            \
> -     mov     %ss,reg /* switch to kernel segments */ ;\
> -     mov     reg,%ds /* (same as kernel stack segment) */ ;\
> -     mov     reg,%es                 ;\
> -     mov     reg,%fs                 ;\
> -     mov     reg,%gs
> -#else // USER32
> +#define SET_KERNEL_SEGMENTS(reg)     \
> +       mov   %ss,reg /* switch to kernel segments */ ;\
> +       mov   reg,%ds /* (same as kernel stack segment) */ ;\
> +       mov   reg,%es                 ;\
> +       mov   reg,%fs                 ;\
> +       mov   reg,%gs
> +#else
>  #define SET_KERNEL_SEGMENTS(reg)
> -#endif // USER32
> -#endif // NCPUS > 1
> +#endif
> +
> +#define RETURN_TO_KERN       0x7eadbeef
> +#define RETURN_TO_USER       0x66666666
> +
> +#ifdef USER32
> +# define SWAPGS_ENTRY_IF_NEEDED_R12
> +# define SWAPGS_EXIT_IF_NEEDED_R12
> +#else
> +/* Keeps %r12 (callee-saved) value throughout interrupt context */
> +# define SWAPGS_ENTRY_IF_NEEDED_R12  \
> +     pushf                           ;\
> +     cli                             ;\
> +     pushq   %rax                    ;\
> +     pushq   %rcx                    ;\
> +     pushq   %rdx                    ;\
> +     movl    $MSR_REG_GSBASE, %ecx   ;\
> +     rdmsr                           ;\
> +     testl   %edx, %edx      /* gs base sign bit set ? */ ;\
> +     js      0f              /* yes, dont swap then return to kernel mode */ 
> ;\
> +     swapgs                  /* no, swap then return to user mode */ ;\
> +     movq    $RETURN_TO_USER, %r12   ;\
> +     jmp     1f                      ;\
> +0:   movq    $RETURN_TO_KERN, %r12   ;\
> +1:   popq    %rdx                    ;\
> +     popq    %rcx                    ;\
> +     popq    %rax                    ;\
> +     popf
> +
> +# define SWAPGS_EXIT_IF_NEEDED_R12   \
> +     cmpq    $RETURN_TO_USER, %r12   ;\
> +     je      0f              /* return to user with swap */ ;\
> +     cmpq    $RETURN_TO_KERN, %r12   ;\
> +     je      1f              /* return to kern without swap */ ;\
> +     ud2                     /* or die */ ;\
> +0:   swapgs                          ;\
> +1:
> +
> +#endif
>  
>  /*
>   * Fault recovery.
> @@ -617,6 +652,7 @@ ENTRY(alltraps)
>       pusha                           /* save the general registers */
>  trap_push_segs:
>       PUSH_SEGMENTS(%rax)             /* and the segment registers */
> +     SWAPGS_ENTRY_IF_NEEDED_R12
>       SET_KERNEL_SEGMENTS(%rax)       /* switch to kernel data segment */
>  trap_set_segs:
>       cld                             /* clear direction flag */
> @@ -673,6 +709,7 @@ _return_to_user:
>   */
>  
>  _return_from_kernel:
> +     SWAPGS_EXIT_IF_NEEDED_R12
>  #ifdef USER32
>  _kret_popl_gs:
>       popq    %gs                     /* restore segment registers */
> @@ -738,6 +775,7 @@ ENTRY(thread_bootstrap_return)
>       movq    %rsp,%rcx                       /* get kernel stack */
>       or      $(KERNEL_STACK_SIZE-1),%rcx
>       movq    -7-IKS_SIZE(%rcx),%rsp          /* switch back to PCB stack */
> +     movq    $RETURN_TO_USER, %r12
>       jmp     _return_from_trap
>  
>  /*
> @@ -752,6 +790,7 @@ ENTRY(thread_syscall_return)
>       or      $(KERNEL_STACK_SIZE-1),%rcx
>       movq    -7-IKS_SIZE(%rcx),%rsp          /* switch back to PCB stack */
>       movq    %rax,R_EAX(%rsp)                /* save return value */
> +     movq    $RETURN_TO_USER, %r12
>       jmp     _return_from_trap
>  
>  ENTRY(call_continuation)
> @@ -829,8 +868,8 @@ INTERRUPT(255)
>  ENTRY(all_intrs)
>       PUSH_REGS_ISR                   /* save registers */
>       cld                             /* clear direction flag */
> -
>       PUSH_SEGMENTS_ISR(%rdx)         /* save segment registers */
> +     SWAPGS_ENTRY_IF_NEEDED_R12
>  
>       CPU_NUMBER_NO_GS(%rcx)
>       movq    %rsp,%rdx               /* on an interrupt stack? */
> @@ -887,6 +926,7 @@ LEXT(return_to_iret)                      /* to find the 
> return from calling interrupt) */
>       cmpq    $0,CX(EXT(need_ast),%rdx)
>       jnz     ast_from_interrupt      /* take it if so */
>  1:
> +     SWAPGS_EXIT_IF_NEEDED_R12
>       POP_SEGMENTS_ISR(%rdx)          /* restore segment regs */
>       POP_AREGS_ISR                   /* restore registers */
>  
> @@ -898,10 +938,10 @@ int_from_intstack:
>       jb      stack_overflowed        /* if not: */
>       call    EXT(interrupt)          /* call interrupt routine */
>  _return_to_iret_i:                   /* ( label for kdb_kintr) */
> +     SWAPGS_EXIT_IF_NEEDED_R12
>       POP_SEGMENTS_ISR(%rdx)
>       POP_AREGS_ISR                   /* restore registers */
>                                       /* no ASTs */
> -
>       iretq
>  
>  stack_overflowed:
> @@ -951,6 +991,7 @@ ast_from_interrupt:
>   *           saved SPL
>   *           saved IRQ
>   *           return address == return_to_iret_i
> + *           saved %r12
>   *           saved %r11
>   *           saved %r10
>   *           saved %r9
> @@ -973,6 +1014,7 @@ ast_from_interrupt:
>   *           saved %fs
>   *           saved %es
>   *           saved %ds
> + *           saved %r12
>   *           saved %r11
>   *           saved %r10
>   *           saved %r9
> @@ -1168,6 +1210,7 @@ syscall_entry_2:
>  
>       pusha                           /* save the general registers */
>       PUSH_SEGMENTS(%rdx)             /* and the segment registers */
> +     SWAPGS_ENTRY_IF_NEEDED_R12
>       SET_KERNEL_SEGMENTS(%rdx)       /* switch to kernel data segment */
>  
>  /*
> @@ -1303,6 +1346,7 @@ mach_call_addr:
>                                       /* set page-fault trap */
>       movq    $(T_PF_USER),R_ERR(%rbx)
>                                       /* set error code - read user space */
> +     movq    $RETURN_TO_USER, %r12
>       jmp     _take_trap              /* treat as a trap */
>  
>  /*
> @@ -1313,6 +1357,7 @@ mach_call_range:
>       movq    $(T_INVALID_OPCODE),R_TRAPNO(%rbx)
>                                       /* set invalid-operation trap */
>       movq    $0,R_ERR(%rbx)          /* clear error code */
> +     movq    $RETURN_TO_USER, %r12
>       jmp     _take_trap              /* treat as a trap */
>  
>  /*
> @@ -1356,6 +1401,7 @@ syscall_addr:
>                                       /* set page-fault trap */
>       movq    $(T_PF_USER),R_ERR(%rbx)
>                                       /* set error code - read user space */
> +     movq    $RETURN_TO_USER, %r12
>       jmp     _take_trap              /* treat as a trap */
>  END(syscall)
>  
> @@ -1367,6 +1413,8 @@ END(syscall)
>   * the syscall.
>   * Note: emulated syscalls seem to not be used anymore in GNU/Hurd, so they
>   * are not handled here.
> + * Note: added complication: need gs base to be in kernel mode during 
> execution
> + * to read the active thread twice.  Call swapgs twice, once at start and at 
> end.
>   * TODO:
>       - for now we assume the return address is canonical, but apparently 
> there
>         can be cases where it's not (see how Linux handles this). Does it 
> apply
> @@ -1375,6 +1423,8 @@ END(syscall)
>         iretq from return_from_trap, works fine in all combinations
>   */
>  ENTRY(syscall64)
> +     /* interrupts are already disabled */
> +     swapgs
>       /* RFLAGS[32:63] are reserved, so combine syscall num (32 bit) and
>        * eflags in RAX to allow using r11 as temporary register
>        */
> @@ -1420,8 +1470,8 @@ ENTRY(syscall64)
>       mov     %r11,%rbx               /* prepare for error handling */
>       mov     %r10,%rcx               /* fix arg3 location according to C ABI 
> */
>  
> -     /* switch to kernel stack, then we can enable interrupts */
> -     CPU_NUMBER_NO_STACK(%r8b, %r8d, %r8, %r11d, %r11)
> +     /* switch to kernel stack then enable interrupts */
> +     CPU_NUMBER(%r11d)               /* we can call the fast version here */
>       movq    CX(EXT(kernel_stack),%r11),%rsp
>       sti
>  
> @@ -1464,7 +1514,8 @@ _syscall64_call:
>  
>  _syscall64_check_for_ast:
>       /* Check for ast. */
> -     CPU_NUMBER_NO_GS(%r11)
> +     CPU_NUMBER(%r11d)
> +
>       cmpl    $0,CX(EXT(need_ast),%r11)
>       jz      _syscall64_restore_state
>  
> @@ -1513,6 +1564,7 @@ _syscall64_restore_state:
>       mov     R_R15(%r11),%r15        /* callee-preserved register */
>       mov     R_EFLAGS(%r11),%r11     /* sysret convention */
>  
> +     swapgs
>       sysretq         /* fast return to user-space, the thread didn't block */
>  
>  /* Error handling fragments, from here we jump directly to the trap handler 
> */
> @@ -1520,12 +1572,14 @@ _syscall64_addr_push:
>       movq    %r11,R_CR2(%rbx)        /* set fault address */
>       movq    $(T_PAGE_FAULT),R_TRAPNO(%rbx)  /* set page-fault trap */
>       movq    $(T_PF_USER),R_ERR(%rbx) /* set error code - read user space */
> +     movq    $RETURN_TO_USER, %r12
>       jmp     _take_trap              /* treat as a trap */
>  
>  _syscall64_range:
>       movq    $(T_INVALID_OPCODE),R_TRAPNO(%rbx)
>                                       /* set invalid-operation trap */
>       movq    $0,R_ERR(%rbx)          /* clear error code */
> +     movq    $RETURN_TO_USER, %r12
>       jmp     _take_trap              /* treat as a trap */
>  
>  END(syscall64)
> -- 
> 2.51.0
> 
> 
> 

-- 
Samuel
What's this script do?
    unzip ; touch ; finger ; mount ; gasp ; yes ; umount ; sleep
Hint for the answer: not everything is computer-oriented. Sometimes you're
in a sleeping bag, camping out.
(Contributed by Frans van der Zande.)

Reply via email to