On Wed, Feb 16, 2022 at 10:04:29PM -0800, Yang Zhong wrote: > Kernel allocates 4K xstate buffer by default. For XSAVE features > which require large state component (e.g. AMX), Linux kernel > dynamically expands the xstate buffer only after the process has > acquired the necessary permissions. Those are called dynamically- > enabled XSAVE features (or dynamic xfeatures). > > There are separate permissions for native tasks and guests. > > Qemu should request the guest permissions for dynamic xfeatures > which will be exposed to the guest. This only needs to be done > once before the first vcpu is created. > > KVM implemented one new ARCH_GET_XCOMP_SUPP system attribute API to > get host side supported_xcr0 and Qemu can decide if it can request > dynamically enabled XSAVE features permission. > https://lore.kernel.org/all/20220126152210.3044876-1-pbonz...@redhat.com/ > > Suggested-by: Paolo Bonzini <pbonz...@redhat.com> > Signed-off-by: Yang Zhong <yang.zh...@intel.com> > Signed-off-by: Jing Liu <jing2....@intel.com> > --- > target/i386/cpu.h | 7 +++++++ > target/i386/cpu.c | 43 +++++++++++++++++++++++++++++++++++++++ > target/i386/kvm/kvm-cpu.c | 12 +++++------ > target/i386/kvm/kvm.c | 20 ++++++++++++++++++ > 4 files changed, 76 insertions(+), 6 deletions(-) > > diff --git a/target/i386/cpu.h b/target/i386/cpu.h > index 06d2d6bccf..d4ad0f56bd 100644 > --- a/target/i386/cpu.h > +++ b/target/i386/cpu.h > @@ -549,6 +549,13 @@ typedef enum X86Seg { > #define XSTATE_ZMM_Hi256_MASK (1ULL << XSTATE_ZMM_Hi256_BIT) > #define XSTATE_Hi16_ZMM_MASK (1ULL << XSTATE_Hi16_ZMM_BIT) > #define XSTATE_PKRU_MASK (1ULL << XSTATE_PKRU_BIT) > +#define XSTATE_XTILE_CFG_MASK (1ULL << XSTATE_XTILE_CFG_BIT) > +#define XSTATE_XTILE_DATA_MASK (1ULL << XSTATE_XTILE_DATA_BIT) > +#define XFEATURE_XTILE_MASK (XSTATE_XTILE_CFG_MASK \ > + | XSTATE_XTILE_DATA_MASK) > + > +#define ARCH_GET_XCOMP_GUEST_PERM 0x1024 > +#define ARCH_REQ_XCOMP_GUEST_PERM 0x1025 > > #define ESA_FEATURE_ALIGN64_BIT 1 > > 
diff --git a/target/i386/cpu.c b/target/i386/cpu.c > index ea7e8f9081..377d993438 100644 > --- a/target/i386/cpu.c > +++ b/target/i386/cpu.c > @@ -43,6 +43,8 @@ > #include "disas/capstone.h" > #include "cpu-internal.h" > > +#include <sys/syscall.h> > + > /* Helpers for building CPUID[2] descriptors: */ > > struct CPUID2CacheDescriptorInfo { > @@ -6000,12 +6002,47 @@ static void x86_cpu_adjust_feat_level(X86CPU *cpu, > FeatureWord w) > } > } > > +static void kvm_request_xsave_components(X86CPU *cpu, uint64_t mask) > +{ > + KVMState *s = kvm_state; > + uint64_t bitmask; > + long rc; > + > + if ((mask & XSTATE_XTILE_DATA_MASK) == XSTATE_XTILE_DATA_MASK) { > + bitmask = kvm_arch_get_supported_cpuid(s, 0xd, 0, R_EAX); > + if (!(bitmask & XSTATE_XTILE_DATA_MASK)) {
Paolo, last time you suggested the below changes here: rc = kvm_arch_get_supported_cpuid(s, 0xd, 0, (xdata_bit < 32 ? R_EAX : R_EDX)); if (!(rc & BIT(xdata_bit & 31)) { ... } Since I used "mask" as the parameter here, I had to use R_EAX directly. Please review, and if it needs to be changed to something like "(xdata_bit < 32 ? R_EAX : R_EDX)", I will change it in the next version. Thanks! Yang > + warn_report("no amx support from supported_xcr0, " > + "bitmask:0x%lx", bitmask); > + return; > + } > + > + rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, > + XSTATE_XTILE_DATA_BIT); > + if (rc) { > + /* > + * The older kernel version(<5.15) can't support > + * ARCH_REQ_XCOMP_GUEST_PERM and directly return. > + */ > + return; > + } > + > + rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &bitmask); > + if (rc) { > + warn_report("prctl(ARCH_GET_XCOMP_GUEST_PERM) error: %ld", rc); > + } else if (!(bitmask & XFEATURE_XTILE_MASK)) { > + warn_report("prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure " > + "and bitmask=0x%lx", bitmask); > + } > + } > +} > + > /* Calculate XSAVE components based on the configured CPU feature flags */ > static void x86_cpu_enable_xsave_components(X86CPU *cpu) > { > CPUX86State *env = &cpu->env; > int i; > uint64_t mask; > + static bool request_perm; > > if (!(env->features[FEAT_1_ECX] & CPUID_EXT_XSAVE)) { > env->features[FEAT_XSAVE_COMP_LO] = 0; > @@ -6021,6 +6058,12 @@ static void x86_cpu_enable_xsave_components(X86CPU > *cpu) > } > } > > + /* Only request permission for first vcpu */ > + if (kvm_enabled() && !request_perm) { > + kvm_request_xsave_components(cpu, mask); > + request_perm = true; > + } > + > env->features[FEAT_XSAVE_COMP_LO] = mask; > env->features[FEAT_XSAVE_COMP_HI] = mask >> 32; > } > diff --git a/target/i386/kvm/kvm-cpu.c b/target/i386/kvm/kvm-cpu.c > index ce27d3b1df..a35a1bf9fe 100644 > --- a/target/i386/kvm/kvm-cpu.c > +++ b/target/i386/kvm/kvm-cpu.c > @@ -84,7 +84,7 @@ static void kvm_cpu_max_instance_init(X86CPU *cpu) > static 
void kvm_cpu_xsave_init(void) > { > static bool first = true; > - KVMState *s = kvm_state; > + uint32_t eax, ebx, ecx, edx; > int i; > > if (!first) { > @@ -100,11 +100,11 @@ static void kvm_cpu_xsave_init(void) > ExtSaveArea *esa = &x86_ext_save_areas[i]; > > if (esa->size) { > - int sz = kvm_arch_get_supported_cpuid(s, 0xd, i, R_EAX); > - if (sz != 0) { > - assert(esa->size == sz); > - esa->offset = kvm_arch_get_supported_cpuid(s, 0xd, i, R_EBX); > - esa->ecx = kvm_arch_get_supported_cpuid(s, 0xd, i, R_ECX); > + host_cpuid(0xd, i, &eax, &ebx, &ecx, &edx); > + if (eax != 0) { > + assert(esa->size == eax); > + esa->offset = ebx; > + esa->ecx = ecx; > } > } > } > diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c > index 2c8feb4a6f..3bdcd724c4 100644 > --- a/target/i386/kvm/kvm.c > +++ b/target/i386/kvm/kvm.c > @@ -348,6 +348,7 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, > uint32_t function, > struct kvm_cpuid2 *cpuid; > uint32_t ret = 0; > uint32_t cpuid_1_edx; > + uint64_t bitmask; > > cpuid = get_supported_cpuid(s); > > @@ -405,6 +406,25 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, > uint32_t function, > if (!has_msr_arch_capabs) { > ret &= ~CPUID_7_0_EDX_ARCH_CAPABILITIES; > } > + } else if (function == 0xd && index == 0 && > + (reg == R_EAX || reg == R_EDX)) { > + struct kvm_device_attr attr = { > + .group = 0, > + .attr = KVM_X86_XCOMP_GUEST_SUPP, > + .addr = (unsigned long) &bitmask > + }; > + > + bool sys_attr = kvm_check_extension(s, KVM_CAP_SYS_ATTRIBUTES); > + if (!sys_attr) { > + warn_report("cannot get sys attribute capabilities %d", > sys_attr); > + } > + > + int rc = kvm_ioctl(s, KVM_GET_DEVICE_ATTR, &attr); > + if (rc == -1 && (errno == ENXIO || errno == EINVAL)) { > + warn_report("KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) " > + "error: %d", rc); > + } > + ret = (reg == R_EAX) ? 
bitmask : bitmask >> 32; > } else if (function == 0x80000001 && reg == R_ECX) { > /* > * It's safe to enable TOPOEXT even if it's not returned by