On 15.08.2025 22:41, Andrew Cooper wrote:
> Most MSR accesses have compile time constant indexes.  By using the immediate
> form when available, the decoder can start issuing uops directly for the
> relevant MSR, rather than having to issue uops to implement "switch (%ecx)".
> Modern CPUs have tens of thousands of MSRs, so that's quite an if/else chain.
> 
> Create __{rdmsr,wrmsrns}_imm() helpers and use them from {rdmsr,wrmsrns}()
> when the compiler can determine that the msr index is known at compile time.
> 
> At the instruction level, the combined ABI is awkward.  Explain our choices in
> detail.
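
For illustration only (the hunk touching rdmsr() itself isn't quoted below,
and __builtin_constant_p() is my guess at how "known at compile time" gets
detected), the dispatch presumably ends up along these lines:

    static inline uint64_t rdmsr(unsigned int msr)
    {
        unsigned long lo, hi;

        /* Constant indexes can use the immediate form, when available. */
        if ( __builtin_constant_p(msr) )
            return __rdmsr_imm(msr);

        asm volatile ( "rdmsr" : "=a" (lo), "=d" (hi) : "c" (msr) );

        return lo | (hi << 32);
    }
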
> 
> Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com>
> ---
> CC: Jan Beulich <jbeul...@suse.com>
> CC: Roger Pau Monné <roger....@citrix.com>
> 
> The expression wrmsrns(MSR_STAR, rdmsr(MSR_STAR)) now yields:
> 
>   <test_star>:
>       b9 81 00 00 c0          mov    $0xc0000081,%ecx
>       0f 32                   rdmsr
>       48 c1 e2 20             shl    $0x20,%rdx
>       48 09 d0                or     %rdx,%rax
>       48 89 c2                mov    %rax,%rdx
>       48 c1 ea 20             shr    $0x20,%rdx
>       2e 0f 30                cs wrmsr
>       e9 a3 84 e8 ff          jmp    ffff82d040204260 <__x86_return_thunk>
> 
> which is as good as we can manage.  The alternative form of this looks like:
> 
>   <test_star>:
>       b9 81 00 00 c0          mov    $0xc0000081,%ecx
>       c4 e7 7b f6 c0 81 00    rdmsr  $0xc0000081,%rax
>       00 c0
>       2e c4 e7 7a f6 c0 81    cs wrmsrns %rax,$0xc0000081
>       00 00 c0
>       e9 xx xx xx xx          jmp    ffff82d040204260 <__x86_return_thunk>
> 
> Still TBD.  We ought to update the *_safe() forms too.  rdmsr_safe() is easier
> because the potential #GP locations line up, but there need to be two variants
> because of

Because of ...?

> --- a/xen/arch/x86/include/asm/alternative.h
> +++ b/xen/arch/x86/include/asm/alternative.h
> @@ -151,6 +151,13 @@ extern void alternative_instructions(void);
>          ALTERNATIVE(oldinstr, newinstr, feature)                        \
>          :: input )
>  
> +#define alternative_input_2(oldinstr, newinstr1, feature1,              \
> +                            newinstr2, feature2, input...)              \
> +    asm_inline volatile (                                               \
> +        ALTERNATIVE_2(oldinstr, newinstr1, feature1,                    \
> +                      newinstr2, feature2)                              \
> +        :: input )
> +
>  /* Like alternative_input, but with a single output argument */
>  #define alternative_io(oldinstr, newinstr, feature, output, input...)   \
>      asm_inline volatile (                                               \
> diff --git a/xen/arch/x86/include/asm/msr.h b/xen/arch/x86/include/asm/msr.h
> index 1bd27b989a4d..2ceff6cca8bb 100644
> --- a/xen/arch/x86/include/asm/msr.h
> +++ b/xen/arch/x86/include/asm/msr.h
> @@ -29,10 +29,52 @@
>   *  wrmsrl(MSR_FOO, val);
>   */
>  
> -static inline uint64_t rdmsr(unsigned int msr)
> +/*
> + * RDMSR with a compile-time constant index, when available.  Falls back to
> + * plain RDMSR.
> + */
> +static always_inline uint64_t __rdmsr_imm(uint32_t msr)
> +{
> +    uint64_t val;
> +
> +    /*
> +     * For best performance, RDMSR $msr, %r64 is recommended.  For
> +     * compatibility, we need to fall back to plain RDMSR.
> +     *
> +     * The combined ABI is awkward, because RDMSR $imm produces an r64,
> +     * whereas WRMSR{,NS} produces a split edx:eax pair.
> +     *
> +     * Always use RDMSR $imm, %rax, because it has the most in common with the
> +     * legacy form.  When MSR_IMM isn't available, emit logic to fold %edx
> +     * back into %rax.
> +     *
> +     * Let the compiler do %ecx setup.  This does mean there's a useless `mov
> +     * $imm, %ecx` in the instruction stream in the MSR_IMM case, but it means
> +     * the compiler can de-duplicate the setup in the common case of reading
> +     * and writing the same MSR.
> +     */
> +    alternative_io(
> +        "rdmsr\n\t"
> +        "shl $32, %%rdx\n\t"
> +        "or %%rdx, %%rax\n\t",
> +
> +        /* RDMSR $msr, %rax */
> +        ".byte 0xc4,0xe7,0x7b,0xf6,0xc0; .long %c[msr]", X86_FEATURE_MSR_IMM,
> +
> +        "=a" (val),

Strictly speaking "=&a". Not that it matters much here; just to not
set a bad precedent.
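
(For reference, "&" marks an earlyclobber output, i.e. one the asm writes
before it has finished consuming its inputs, so the compiler won't place any
input in the same register.  With the index presumably passed as "c" (msr)
plus an "i" constraint for %c[msr], nothing can clash in practice, hence "not
that it matters much"; the strictly correct operand would be:)

    "=&a" (val),
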

> @@ -55,11 +97,51 @@ static inline void wrmsr(unsigned int msr, uint64_t val)
>  }
>  #define wrmsrl(msr, val) wrmsr(msr, val)
>  
> +/*
> + * Non-serialising WRMSR with a compile-time constant index, when available.
> + * Falls back to plain WRMSRNS, or to a serialising WRMSR.
> + */
> +static always_inline void __wrmsrns_imm(uint32_t msr, uint64_t val)
> +{
> +    /*
> +     * For best performance, WRMSRNS %r64, $msr is recommended.  For
> +     * compatibility, we need to fall back to plain WRMSRNS, or to WRMSR.
> +     *
> +     * The combined ABI is awkward, because WRMSRNS $imm takes a single r64,
> +     * whereas WRMSR{,NS} takes a split edx:eax pair.
> +     *
> +     * Always use WRMSRNS %rax, $imm, because it has the most in common with
> +     * the legacy forms.  When MSR_IMM isn't available, emit setup logic for
> +     * %edx.
> +     *
> +     * Let the compiler do %ecx setup.  This does mean there's a useless `mov
> +     * $imm, %ecx` in the instruction stream in the MSR_IMM case, but it means
> +     * the compiler can de-duplicate the setup in the common case of reading
> +     * and writing the same MSR.
> +     */
> +    alternative_input_2(
> +        "mov %%rax, %%rdx\n\t"
> +        "shr $32, %%rdx\n\t"
> +        ".byte 0x2e; wrmsr",
> +
> +        /* CS WRMSRNS %rax, $msr */
> +        ".byte 0x2e,0xc4,0xe7,0x7a,0xf6,0xc0; .long %c[msr]", 
> X86_FEATURE_MSR_IMM,
> +
> +        "mov %%rax, %%rdx\n\t"
> +        "shr $32, %%rdx\n\t"
> +        ".byte 0x0f,0x01,0xc6", X86_FEATURE_WRMSRNS,

Isn't this the wrong way round for hardware which has both features? The
last active alternative wins, iirc.

Jan
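
On the ordering point: if the last active alternative is indeed what takes
effect when both features are set, the immediate form presumably wants to be
listed second so that it wins on such hardware.  A sketch only (the operand
list is guessed from the quoted fragments, not taken from the patch):

    alternative_input_2(
        /* Fallback: serialising CS WRMSR */
        "mov %%rax, %%rdx\n\t"
        "shr $32, %%rdx\n\t"
        ".byte 0x2e; wrmsr",

        /* WRMSRNS, legacy %edx:%eax form */
        "mov %%rax, %%rdx\n\t"
        "shr $32, %%rdx\n\t"
        ".byte 0x0f,0x01,0xc6", X86_FEATURE_WRMSRNS,

        /* CS WRMSRNS %rax, $msr */
        ".byte 0x2e,0xc4,0xe7,0x7a,0xf6,0xc0; .long %c[msr]", X86_FEATURE_MSR_IMM,

        [msr] "i" (msr), "a" (val), "c" (msr) );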
