On 14/08/2025 11:52 am, Jan Beulich wrote:
> On 14.08.2025 12:17, Andrew Cooper wrote:
>> On 14/08/2025 9:58 am, Jan Beulich wrote:
>>> On 13.08.2025 13:53, Andrew Cooper wrote:
>>>> On 12/08/2025 10:52 am, Jan Beulich wrote:
>>>>> On 11.08.2025 10:17, Andrew Cooper wrote:
>>>>>> On 08/08/2025 9:23 pm, Andrew Cooper wrote:
>>>>>>> ... along with the supporting functions.  Switch to Xen coding style, and make
>>>>>>> static as there are no external callers.
>>>>>>>
>>>>>>> Rename to legacy_syscall_init() as a more accurate name.
>>>>>>>
>>>>>>> No functional change.
>>>>>>>
>>>>>>> Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com>
>>>>>>> ---
>>>>>>> CC: Jan Beulich <jbeul...@suse.com>
>>>>>>> CC: Roger Pau Monné <roger....@citrix.com>
>>>>>>> ---
>>>>>>>  xen/arch/x86/include/asm/system.h |  2 -
>>>>>>>  xen/arch/x86/traps-setup.c        | 97 ++++++++++++++++++++++++++++++-
>>>>>>>  xen/arch/x86/x86_64/traps.c       | 92 -----------------------------
>>>>>>>  3 files changed, 95 insertions(+), 96 deletions(-)
>>>>>>>
>>>>>>> diff --git a/xen/arch/x86/include/asm/system.h b/xen/arch/x86/include/asm/system.h
>>>>>>> index 3cdc56e4ba6d..6c2800d8158d 100644
>>>>>>> --- a/xen/arch/x86/include/asm/system.h
>>>>>>> +++ b/xen/arch/x86/include/asm/system.h
>>>>>>> @@ -256,6 +256,4 @@ static inline int local_irq_is_enabled(void)
>>>>>>>  #define BROKEN_ACPI_Sx          0x0001
>>>>>>>  #define BROKEN_INIT_AFTER_S1    0x0002
>>>>>>>  
>>>>>>> -void subarch_percpu_traps_init(void);
>>>>>>> -
>>>>>>>  #endif
>>>>>>> diff --git a/xen/arch/x86/traps-setup.c b/xen/arch/x86/traps-setup.c
>>>>>>> index 13b8fcf0ba51..fbae7072c292 100644
>>>>>>> --- a/xen/arch/x86/traps-setup.c
>>>>>>> +++ b/xen/arch/x86/traps-setup.c
>>>>>>> @@ -2,13 +2,15 @@
>>>>>>>  /*
>>>>>>>   * Configuration of event handling for all CPUs.
>>>>>>>   */
>>>>>>> +#include <xen/domain_page.h>
>>>>>>>  #include <xen/init.h>
>>>>>>>  #include <xen/param.h>
>>>>>>>  
>>>>>>> +#include <asm/endbr.h>
>>>>>>>  #include <asm/idt.h>
>>>>>>>  #include <asm/msr.h>
>>>>>>>  #include <asm/shstk.h>
>>>>>>> -#include <asm/system.h>
>>>>>>> +#include <asm/stubs.h>
>>>>>>>  #include <asm/traps.h>
>>>>>>>  
>>>>>>>  DEFINE_PER_CPU_READ_MOSTLY(idt_entry_t *, idt);
>>>>>>> @@ -19,6 +21,8 @@ static bool __initdata opt_ler;
>>>>>>>  boolean_param("ler", opt_ler);
>>>>>>>  
>>>>>>>  void nocall entry_PF(void);
>>>>>>> +void nocall lstar_enter(void);
>>>>>>> +void nocall cstar_enter(void);
>>>>>>>  
>>>>>>>  /*
>>>>>>>   * Sets up system tables and descriptors for IDT delivery.
>>>>>>> @@ -138,6 +142,95 @@ static void load_system_tables(void)
>>>>>>>      BUG_ON(stack_bottom & 15);
>>>>>>>  }
>>>>>>>  
>>>>>>> +static unsigned int write_stub_trampoline(
>>>>>>> +    unsigned char *stub, unsigned long stub_va,
>>>>>>> +    unsigned long stack_bottom, unsigned long target_va)
>>>>>>> +{
>>>>>>> +    unsigned char *p = stub;
>>>>>>> +
>>>>>>> +    if ( cpu_has_xen_ibt )
>>>>>>> +    {
>>>>>>> +        place_endbr64(p);
>>>>>>> +        p += 4;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    /* Store guest %rax into %ss slot */
>>>>>>> +    /* movabsq %rax, stack_bottom - 8 */
>>>>>>> +    *p++ = 0x48;
>>>>>>> +    *p++ = 0xa3;
>>>>>>> +    *(uint64_t *)p = stack_bottom - 8;
>>>>>>> +    p += 8;
>>>>>>> +
>>>>>>> +    /* Store guest %rsp in %rax */
>>>>>>> +    /* movq %rsp, %rax */
>>>>>>> +    *p++ = 0x48;
>>>>>>> +    *p++ = 0x89;
>>>>>>> +    *p++ = 0xe0;
>>>>>>> +
>>>>>>> +    /* Switch to Xen stack */
>>>>>>> +    /* movabsq $stack_bottom - 8, %rsp */
>>>>>>> +    *p++ = 0x48;
>>>>>>> +    *p++ = 0xbc;
>>>>>>> +    *(uint64_t *)p = stack_bottom - 8;
>>>>>>> +    p += 8;
>>>>>>> +
>>>>>>> +    /* jmp target_va */
>>>>>>> +    *p++ = 0xe9;
>>>>>>> +    *(int32_t *)p = target_va - (stub_va + (p - stub) + 4);
>>>>>>> +    p += 4;
>>>>>>> +
>>>>>>> +    /* Round up to a multiple of 16 bytes. */
>>>>>>> +    return ROUNDUP(p - stub, 16);
>>>>>>> +}
>>>>>>> +
>>>>>>> +static void legacy_syscall_init(void)
>>>>>>> +{
>>>>>>> +    unsigned long stack_bottom = get_stack_bottom();
>>>>>>> +    unsigned long stub_va = this_cpu(stubs.addr);
>>>>>>> +    unsigned char *stub_page;
>>>>>>> +    unsigned int offset;
>>>>>>> +
>>>>>>> +    /* No PV guests?  No need to set up SYSCALL/SYSENTER infrastructure. */
>>>>>>> +    if ( !IS_ENABLED(CONFIG_PV) )
>>>>>>> +        return;
>>>>>>> +
>>>>>>> +    stub_page = map_domain_page(_mfn(this_cpu(stubs.mfn)));
>>>>>>> +
>>>>>>> +    /*
>>>>>>> +     * Trampoline for SYSCALL entry from 64-bit mode.  The VT-x HVM vcpu
>>>>>>> +     * context switch logic relies on the SYSCALL trampoline being at the
>>>>>>> +     * start of the stubs.
>>>>>>> +     */
>>>>>>> +    wrmsrl(MSR_LSTAR, stub_va);
>>>>>>> +    offset = write_stub_trampoline(stub_page + (stub_va & ~PAGE_MASK),
>>>>>>> +                                   stub_va, stack_bottom,
>>>>>>> +                                   (unsigned long)lstar_enter);
>>>>>>> +    stub_va += offset;
>>>>>>> +
>>>>>>> +    if ( cpu_has_sep )
>>>>>>> +    {
>>>>>>> +        /* SYSENTER entry. */
>>>>>>> +        wrmsrl(MSR_IA32_SYSENTER_ESP, stack_bottom);
>>>>>>> +        wrmsrl(MSR_IA32_SYSENTER_EIP, (unsigned long)sysenter_entry);
>>>>>>> +        wrmsr(MSR_IA32_SYSENTER_CS, __HYPERVISOR_CS, 0);
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    /* Trampoline for SYSCALL entry from compatibility mode. */
>>>>>>> +    wrmsrl(MSR_CSTAR, stub_va);
>>>>>>> +    offset += write_stub_trampoline(stub_page + (stub_va & ~PAGE_MASK),
>>>>>>> +                                    stub_va, stack_bottom,
>>>>>>> +                                    (unsigned long)cstar_enter);
>>>>>>> +
>>>>>>> +    /* Don't consume more than half of the stub space here. */
>>>>>>> +    ASSERT(offset <= STUB_BUF_SIZE / 2);
>>>>>>> +
>>>>>>> +    unmap_domain_page(stub_page);
>>>>>>> +
>>>>>>> +    /* Common SYSCALL parameters. */
>>>>>>> +    wrmsrl(MSR_STAR, XEN_MSR_STAR);
>>>>>>> +    wrmsrl(MSR_SYSCALL_MASK, XEN_SYSCALL_MASK);
>>>>>>> +}
>>>>>> These want adjusting to use wrmsrns(), similarly to the previous patch.
>>>>>> Fixed locally.
>>>>> Also the one higher in the function, I suppose.
>>>> All of them.
>>>>
>>>> I'm not aware of anywhere where we want serialising behaviour, except for
>>>> ICR which is buggily non-serialising and has workarounds.
>>>>
>>>> But I'm also not sure enough of this to suggest that we make wrmsr() be
>>>> wrmsrns() by default.
>>> I'm pretty sure we don't want this. If nothing else, then to avoid code
>>> bloat for MSR writes which are non-serializing even in the original form.
>> Even that's complicated.
>>
>> For FRED, FS/GS_BASE/KERN need changes because the lack of SWAPGS forces
>> MSR accesses even if we do have FSGSBASE active.
>>
>> Writes to these were made non-serialising in Zen2 and later, but are
>> still serialising on Intel.  i.e. they need converting to WRMSRNS even
>> though plain WRMSR would be "fine" on all AMD systems (either because
>> it's the only option, or because it's non-serialising).
> Right, such would need converting. But x2APIC MSR accesses, for example,
> shouldn't have a need.

For serialising-ness, yes, but they still want to be MSR_IMM when
available, at which point the code bloat price is already paid.
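
For illustration, a minimal sketch of the conversion being discussed, for
the tail of legacy_syscall_init() (a sketch, assuming wrmsrns() takes the
same (msr, val) arguments as wrmsrl() and falls back to a plain WRMSR
where WRMSRNS isn't supported):

    /* Common SYSCALL parameters.  Neither write needs to serialise. */
    wrmsrns(MSR_STAR, XEN_MSR_STAR);
    wrmsrns(MSR_SYSCALL_MASK, XEN_SYSCALL_MASK);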

~Andrew
