On Mon, Jan 24, 2022 at 11:16:36AM +0100, Paolo Bonzini wrote:
> On 1/24/22 08:55, Yang Zhong wrote:
> >Kernel allocates 4K xstate buffer by default. For XSAVE features
> >which require large state component (e.g. AMX), Linux kernel
> >dynamically expands the xstate buffer only after the process has
> >acquired the necessary permissions. Those are called dynamically-
> >enabled XSAVE features (or dynamic xfeatures).
> >
> >There are separate permissions for native tasks and guests.
> >
> >Qemu should request the guest permissions for dynamic xfeatures
> >which will be exposed to the guest. This only needs to be done
> >once before the first vcpu is created.
> >
> >Suggested-by: Paolo Bonzini <pbonz...@redhat.com>
> >Signed-off-by: Yang Zhong <yang.zh...@intel.com>
> >Signed-off-by: Jing Liu <jing2....@intel.com>
> >Signed-off-by: Wei Wang <wei.w.w...@intel.com>
> >---
> >  target/i386/cpu.h         |  7 +++++++
> >  target/i386/cpu.c         | 31 +++++++++++++++++++++++++++++++
> >  target/i386/kvm/kvm-cpu.c | 12 ++++++------
> >  target/i386/kvm/kvm.c     |  6 ++++++
> >  4 files changed, 50 insertions(+), 6 deletions(-)
> >
> >diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> >index 06d2d6bccf..d4ad0f56bd 100644
> >--- a/target/i386/cpu.h
> >+++ b/target/i386/cpu.h
> >@@ -549,6 +549,13 @@ typedef enum X86Seg {
> >  #define XSTATE_ZMM_Hi256_MASK           (1ULL << XSTATE_ZMM_Hi256_BIT)
> >  #define XSTATE_Hi16_ZMM_MASK            (1ULL << XSTATE_Hi16_ZMM_BIT)
> >  #define XSTATE_PKRU_MASK                (1ULL << XSTATE_PKRU_BIT)
> >+#define XSTATE_XTILE_CFG_MASK           (1ULL << XSTATE_XTILE_CFG_BIT)
> >+#define XSTATE_XTILE_DATA_MASK          (1ULL << XSTATE_XTILE_DATA_BIT)
> >+#define XFEATURE_XTILE_MASK             (XSTATE_XTILE_CFG_MASK \
> >+                                         | XSTATE_XTILE_DATA_MASK)
> >+
> >+#define ARCH_GET_XCOMP_GUEST_PERM       0x1024
> >+#define ARCH_REQ_XCOMP_GUEST_PERM       0x1025
> >  #define ESA_FEATURE_ALIGN64_BIT         1
> >diff --git a/target/i386/cpu.c b/target/i386/cpu.c
> >index 3390820745..29b0348c25 100644
> >--- a/target/i386/cpu.c
> >+++ b/target/i386/cpu.c
> >@@ -43,6 +43,10 @@
> >  #include "disas/capstone.h"
> >  #include "cpu-internal.h"
> >+#include <sys/syscall.h>
> >+
> >+bool request_perm;
> >+
> >  /* Helpers for building CPUID[2] descriptors: */
> >  struct CPUID2CacheDescriptorInfo {
> >@@ -6000,6 +6004,27 @@ static void x86_cpu_adjust_feat_level(X86CPU *cpu, 
> >FeatureWord w)
> >      }
> >  }
> >+static void kvm_request_xsave_components(X86CPU *cpu, uint32_t bit)
> >+{
> >+    KVMState *s = CPU(cpu)->kvm_state;
> >+
> >+    long rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM,
> >+                      bit);
> >+    if (rc) {
> >+        /*
> >+         * The older kernel version(<5.15) can't support
> >+         * ARCH_REQ_XCOMP_GUEST_PERM and directly return.
> >+         */
> >+        return;
> >+    }
> >+
> >+    rc = kvm_arch_get_supported_cpuid(s, 0xd, 0, R_EAX);
> >+    if (!(rc & XFEATURE_XTILE_MASK)) {
> >+        error_report("get cpuid failure and rc=0x%lx", rc);
> >+        exit(EXIT_FAILURE);
> >+    }
> >+}
> >+
> >  /* Calculate XSAVE components based on the configured CPU feature flags */
> >  static void x86_cpu_enable_xsave_components(X86CPU *cpu)
> >  {
> >@@ -6021,6 +6046,12 @@ static void x86_cpu_enable_xsave_components(X86CPU 
> >*cpu)
> >          }
> >      }
> >+    /* Only request permission from fisrt vcpu. */
> >+    if (kvm_enabled() && !request_perm) {
> >+        kvm_request_xsave_components(cpu, XSTATE_XTILE_DATA_BIT);
> >+        request_perm = true;
> >+    }
> 
> You should pass "mask" here, or "mask & XSTATE_DYNAMIC_MASK" so that
> the components are only requested if necessary.

  Thanks, I will pass "mask" here, which can make kvm_request_xsave_components()
  reused in the future.

  Yang 

> 
> >      env->features[FEAT_XSAVE_COMP_LO] = mask;
> >      env->features[FEAT_XSAVE_COMP_HI] = mask >> 32;
> >  }
> >diff --git a/target/i386/kvm/kvm-cpu.c b/target/i386/kvm/kvm-cpu.c
> >index 033ca011ea..5ab6a0b9d2 100644
> >--- a/target/i386/kvm/kvm-cpu.c
> >+++ b/target/i386/kvm/kvm-cpu.c
> >@@ -84,7 +84,7 @@ static void kvm_cpu_max_instance_init(X86CPU *cpu)
> >  static void kvm_cpu_xsave_init(void)
> >  {
> >      static bool first = true;
> >-    KVMState *s = kvm_state;
> >+    uint32_t eax, ebx, ecx, edx;
> >      int i;
> >      if (!first) {
> >@@ -100,13 +100,13 @@ static void kvm_cpu_xsave_init(void)
> >          ExtSaveArea *esa = &x86_ext_save_areas[i];
> >          if (esa->size) {
> >-            int sz = kvm_arch_get_supported_cpuid(s, 0xd, i, R_EAX);
> >-            if (sz != 0) {
> >-                assert(esa->size == sz);
> >-                esa->offset = kvm_arch_get_supported_cpuid(s, 0xd, i, 
> >R_EBX);
> >+            host_cpuid(0xd, i, &eax, &ebx, &ecx, &edx);
> >+            if (eax != 0) {
> >+                assert(esa->size == eax);
> >+                esa->offset = ebx;
> >              }
> >-            esa->ecx = kvm_arch_get_supported_cpuid(s, 0xd, i, R_ECX);
> >+            esa->ecx = ecx;
> 
> I think esa->ecx should be assigned inside the "if".  This is also
> true in patch 1.

  Yes, you are right, thanks!
  
  Yang

> 
> >          }
> >      }
> >  }
> >diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> >index 2c8feb4a6f..caf1388d8b 100644
> >--- a/target/i386/kvm/kvm.c
> >+++ b/target/i386/kvm/kvm.c
> >@@ -405,6 +405,12 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, 
> >uint32_t function,
> >          if (!has_msr_arch_capabs) {
> >              ret &= ~CPUID_7_0_EDX_ARCH_CAPABILITIES;
> >          }
> >+    } else if (function == 0xd && index == 0 && reg == R_EAX) {
> >+        /*
> >+         * We can set the AMX XTILE DATA flag, even if KVM does not
> >+         * return it on GET_SUPPORTED_CPUID.
> >+         */
> >+        ret |= XSTATE_XTILE_DATA_MASK;
> >      } else if (function == 0x80000001 && reg == R_ECX) {
> >          /*
> >           * It's safe to enable TOPOEXT even if it's not returned by
> >
> 
> Instead of setting XSTATE_XTILE_DATA_MASK blindly, you need
> something like arch_prctl(ARCH_GET_XCOMP_GUEST_SUPP).  However, this
> arch_prctl only exists in the bare-metal version
> ARCH_GET_XCOMP_SUPP, and it would need access to the variable
> supported_xcr0 that KVM exports.  So I think it's better to
> implement it as a new KVM_CHECK_EXTENSION value instead of a prctl.
> 
  
  Thanks Paolo, from your below KVM changes:
  
https://lore.kernel.org/kvm/20220126152210.3044876-3-pbonz...@redhat.com/T/#m7bf9a03c47c29d21deb78604bc290a45aa5e98f5

  So the changes in kvm_arch_get_supported_cpuid() like below? 
  +    } else if (function == 0xd && index == 0 && reg == R_EAX) {
  +     struct kvm_device_attr attr = {
  +             .group = 0,
  +             .attr = KVM_X86_XCOMP_GUEST_SUPP,
  +             .addr = (unsigned long) &bitmask
  +     };
  +
  +     kvm_fd = open_kvm_dev_path_or_exit();
  +     rc = ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr);
  +     close(kvm_fd);
  +     ret = bitmask;
  +   } 

  This can get supported_xcr0 from KVM side.

  So in the kvm_request_xsave_components(), we can do below steps?
 
  +    /* Check supported_xr0 firstly */ 
  +    rc = kvm_arch_get_supported_cpuid(s, 0xd, 0, R_EAX);
  +    if (!(rc & XFEATURE_XTILE_MASK)) {
  +        error_report("host xcr0 can't support AMX xdata and rc=0x%lx", rc);
  +        exit(EXIT_FAILURE);
  +    }

  +   /* request amx permission */
  +   syscall(ARCH_REQ_XCOMP_GUEST_PERM, xdata_bit);


  +   /* check amx permission */
  +   syscall(ARCH_GET_XCOMP_GUEST_PERM);
 
  Please help check those steps, thanks!

  Yang 

> Paolo

Reply via email to