On 28.08.2025 17:03, Andrew Cooper wrote:
> Under FRED, there's one entrypoint from Ring 3, and one from Ring 0.
> 
> FRED gives us a good stack (even for SYSCALL/SYSENTER), and a unified event
> frame on the stack, meaing that all software needs to do is spill the GPRs
> with a line of PUSHes.  Introduce PUSH_AND_CLEAR_GPRS and POP_GPRS for this
> purpose.
> 
> Introduce entry_FRED_R0() which to a first appoximation is complete for all
> event handling within Xen.
> 
> entry_FRED_R0() needs deriving from entry_FRED_R3(), so introduce a basic
> handler.  There is more work required to make the return-to-guest path work
> under FRED, so leave a BUG clearly in place.
> 
> Also introduce entry_from_{xen,pv}() to be the C level handlers.  By simply
> copying regs->fred_ss.vector into regs->entry_vector, we can reuse all the
> existing fault handlers.
> 
> Extend fatal_trap() to render the event type, including by name, when FRED is
> active.  This is slightly complicated, because X86_ET_OTHER must not use
> vector_name() or SYSCALL and SYSENTER get rendered as #BP and #DB.  Also,
> {read,write}_gs_shadow() needs modifying to avoid the SWAPGS instruction,
> which is disallowed in FRED mode.

This last sentence looks to be stale now.

> --- a/xen/arch/x86/include/asm/asm_defns.h
> +++ b/xen/arch/x86/include/asm/asm_defns.h
> @@ -315,6 +315,71 @@ static always_inline void stac(void)
>          subq  $-(UREGS_error_code-UREGS_r15+\adj), %rsp
>  .endm
>  
> +/*
> + * Push and clear GPRs
> + */
> +.macro PUSH_AND_CLEAR_GPRS
> +        push  %rdi
> +        xor   %edi, %edi
> +        push  %rsi
> +        xor   %esi, %esi
> +        push  %rdx
> +        xor   %edx, %edx
> +        push  %rcx
> +        xor   %ecx, %ecx
> +        push  %rax
> +        xor   %eax, %eax
> +        push  %r8
> +        xor   %r8d, %r8d
> +        push  %r9
> +        xor   %r9d, %r9d
> +        push  %r10
> +        xor   %r10d, %r10d
> +        push  %r11
> +        xor   %r11d, %r11d
> +        push  %rbx
> +        xor   %ebx, %ebx
> +        push  %rbp
> +#ifdef CONFIG_FRAME_POINTER
> +/* Indicate special exception stack frame by inverting the frame pointer. */
> +        mov   %rsp, %rbp
> +        notq  %rbp
> +#else
> +        xor   %ebp, %ebp
> +#endif
> +        push  %r12
> +        xor   %r12d, %r12d
> +        push  %r13
> +        xor   %r13d, %r13d
> +        push  %r14
> +        xor   %r14d, %r14d
> +        push  %r15
> +        xor   %r15d, %r15d
> +.endm
> +
> +/*
> + * POP GPRs from a UREGS_* frame on the stack.  Does not modify flags.
> + *
> + * @rax: Alternative destination for the %rax value on the stack.
> + */
> +.macro POP_GPRS rax=%rax

The parameter isn't really needed, at least not just yet. Do you have firm
plans for using it (presumably in the SVM code ahead of VMRUN)? Else I'd
suggest to omit it, as it's fragile anyway: One can't use an arbitrary
register for it; only ...

> +        pop   %r15
> +        pop   %r14
> +        pop   %r13
> +        pop   %r12
> +        pop   %rbp
> +        pop   %rbx
> +        pop   %r11
> +        pop   %r10
> +        pop   %r9
> +        pop   %r8
> +        pop   \rax
> +        pop   %rcx
> +        pop   %rdx
> +        pop   %rsi
> +        pop   %rdi

... the ones POPed later are candidates. Their order isn't really visible
at use sites of the macro, though. (A possibility would be to indicate
via argument only _that_ the %rax slot wants discarding, without indicating
by use of which register.)

> --- a/xen/arch/x86/traps.c
> +++ b/xen/arch/x86/traps.c
> @@ -89,6 +89,13 @@ const unsigned int nmi_cpu;
>  #define stack_words_per_line 4
>  #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)(regs)->rsp)
>  
> +/* Only valid to use when FRED is active. */
> +static inline struct fred_info *cpu_regs_fred_info(struct cpu_user_regs 
> *regs)
> +{
> +    ASSERT(read_cr4() & X86_CR4_FRED);
> +    return (void *)(regs + 1);

Maybe better

    &container_of(regs, struct cpu_info, guest_cpu_user_regs)->_fred;

?

> @@ -1101,9 +1134,41 @@ void fatal_trap(const struct cpu_user_regs *regs, bool 
> show_remote)
>          }
>      }
>  
> -    panic("FATAL TRAP: vec %u, %s[%04x]%s\n",
> -          trapnr, vector_name(trapnr), regs->error_code,
> -          (regs->eflags & X86_EFLAGS_IF) ? "" : " IN INTERRUPT CONTEXT");
> +    if ( read_cr4() & X86_CR4_FRED )
> +    {
> +        bool render_ec = false;
> +        const char *vec_name = NULL;
> +
> +        switch ( regs->fred_ss.type )
> +        {
> +        case X86_ET_HW_EXC:
> +        case X86_ET_PRIV_SW_EXC:
> +        case X86_ET_SW_EXC:
> +            render_ec = true;
> +            vec_name = vector_name(regs->fred_ss.vector);
> +            break;
> +
> +        case X86_ET_OTHER:
> +            vec_name = x86_et_other_name(regs->fred_ss.vector);
> +            break;
> +        }
> +
> +        if ( render_ec )
> +            panic("Fatal TRAP: type %u, %s, vec %u, %s[%04x]%s\n",

Is it deliberate that here and ...

> +                  regs->fred_ss.type, x86_et_name(regs->fred_ss.type),
> +                  regs->fred_ss.vector, vec_name ?: "",
> +                  regs->error_code,
> +                  (regs->eflags & X86_EFLAGS_IF) ? "" : " IN INTERRUPT 
> CONTEXT");
> +        else
> +            panic("Fatal TRAP: type %u, %s, vec %u, %s%s\n",

.. here it's "Fatal", when ....

> +                  regs->fred_ss.type, x86_et_name(regs->fred_ss.type),
> +                  regs->fred_ss.vector, vec_name ?: "",
> +                  (regs->eflags & X86_EFLAGS_IF) ? "" : " IN INTERRUPT 
> CONTEXT");
> +    }
> +    else
> +        panic("FATAL TRAP: vec %u, %s[%04x]%s\n",

... here it's "FATAL"? (Personally I prefer the all capitals form.)

> +              trapnr, vector_name(trapnr), regs->error_code,
> +              (regs->eflags & X86_EFLAGS_IF) ? "" : " IN INTERRUPT CONTEXT");

Just as a remark (the "else" thing still needs sorting) - without "else"
this would be a smaller diff.

> @@ -2198,6 +2263,94 @@ void asmlinkage check_ist_exit(const struct 
> cpu_user_regs *regs, bool ist_exit)
>  }
>  #endif
>  
> +void asmlinkage entry_from_pv(struct cpu_user_regs *regs)
> +{
> +    /* Copy fred_ss.vector into entry_vector as IDT delivery would have 
> done. */
> +    regs->entry_vector = regs->fred_ss.vector;
> +
> +    fatal_trap(regs, false);
> +}
> +
> +void asmlinkage entry_from_xen(struct cpu_user_regs *regs)
> +{
> +    struct fred_info *fi = cpu_regs_fred_info(regs);
> +    uint8_t type = regs->fred_ss.type;
> +
> +    /* Copy fred_ss.vector into entry_vector as IDT delivery would have 
> done. */
> +    regs->entry_vector = regs->fred_ss.vector;
> +
> +    /*
> +     * First, handle the asynchronous or fatal events.  These are either
> +     * unrelated to the interrupted context, or may not have valid context
> +     * recorded, and all have special rules on how/whether to re-enable IRQs.
> +     */
> +    switch ( type )
> +    {
> +    case X86_ET_EXT_INTR:
> +        return do_IRQ(regs);
> +
> +    case X86_ET_NMI:
> +        return do_nmi(regs);
> +
> +    case X86_ET_HW_EXC:
> +        switch ( regs->fred_ss.vector )
> +        {
> +        case X86_EXC_DF: return do_double_fault(regs);
> +        case X86_EXC_MC: return do_machine_check(regs);
> +        }
> +        break;
> +    }
> +
> +    /*
> +     * With the asynchronous events handled, what remains are the synchronous
> +     * ones.  If we interrupted an IRQs-on region, we should re-enable IRQs
> +     * now; for #PF and #DB, %cr2 and %dr6 are on the stack in edata.
> +     */
> +    if ( regs->eflags & X86_EFLAGS_IF )
> +        local_irq_enable();

Ah yes, here we go (as to my question on an earlier patch).

> +    switch ( type )
> +    {
> +    case X86_ET_HW_EXC:
> +    case X86_ET_PRIV_SW_EXC:
> +    case X86_ET_SW_EXC:
> +        switch ( regs->fred_ss.vector )
> +        {
> +        case X86_EXC_PF:  handle_PF(regs, fi->edata); break;
> +        case X86_EXC_GP:  do_general_protection(regs); break;
> +        case X86_EXC_UD:  do_invalid_op(regs); break;
> +        case X86_EXC_NM:  do_device_not_available(regs); break;
> +        case X86_EXC_BP:  do_int3(regs); break;
> +        case X86_EXC_DB:  handle_DB(regs, fi->edata); break;
> +
> +        case X86_EXC_DE:
> +        case X86_EXC_OF:
> +        case X86_EXC_BR:
> +        case X86_EXC_NP:
> +        case X86_EXC_SS:
> +        case X86_EXC_MF:
> +        case X86_EXC_AC:
> +        case X86_EXC_XM:
> +            do_trap(regs);
> +            break;
> +
> +        case X86_EXC_CP:  do_entry_CP(regs); break;

Any reason this isn't grouped with the other single-line cases?

Jan

Reply via email to