Script 'mail_helper' called by obssrc
Hello community,

here is the log from the commit of package xen for openSUSE:Factory checked in 
at 2022-02-22 21:17:55
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/xen (Old)
 and      /work/SRC/openSUSE:Factory/.xen.new.1958 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Package is "xen"

Tue Feb 22 21:17:55 2022 rev:312 rq:956542 version:4.16.0_06

Changes:
--------
--- /work/SRC/openSUSE:Factory/xen/xen.changes  2022-01-27 23:16:44.727069963 +0100
+++ /work/SRC/openSUSE:Factory/.xen.new.1958/xen.changes        2022-02-22 21:18:18.214287469 +0100
@@ -1,0 +2,42 @@
+Mon Feb 14 11:40:00 CET 2022 - [email protected]
+
+- Upstream bug fixes (bsc#1027519)
+  61e0296a-x86-time-calibration-relative-counts.patch
+  61e029c8-x86-time-TSC-freq-calibration-accuracy.patch
+  61e02a1c-libxl-PCI-PV-hotplug-stubdom-coldplug.patch
+  61e98e88-x86-introduce-get-set-reg-infra.patch
+  61e98e89-x86-MSR-split-SPEC_CTRL-handling.patch
+  61e98e8a-x86-spec-ctrl-drop-ENTRY-EXIT-HVM.patch
+  61e98e8b-VT-x-SPEC_CTRL-NMI-race-condition.patch
+  61eaaa23-x86-get-set-reg-infra-build.patch
+  61efec1d-Arm-P2M-always-clear-entry-on-mapping-removal.patch
+  61efec4d-gnttab-only-decrement-refcounter-on-final-unmap.patch
+  61efec96-IOMMU-x86-stop-pirq-iteration-immediately-on-error.patch
+  61f2d886-x86-CPUID-disentangle-new-leaves-logic.patch
+  61f2d887-x86-CPUID-leaf-7-1-EBX-infra.patch
+  61f2dd76-x86-SPEC_CTRL-migration-compatibility.patch
+  61f7b2af-libxl-dont-touch-nr_vcpus_out-if-listing.patch
+  61f933a4-x86-cpuid-advertise-SSB_NO.patch
+  61f933a5-x86-drop-use_spec_ctrl-boolean.patch
+  61f933a6-x86-new-has_spec_ctrl-boolean.patch
+  61f933a7-x86-dont-use-spec_ctrl-enter-exit-for-S3.patch
+  61f933a8-x86-SPEC_CTRL-record-last-write.patch
+  61f933a9-x86-SPEC_CTRL-use-common-logic-for-AMD.patch
+  61f933aa-SVM-SPEC_CTRL-entry-exit-logic.patch
+  61f933ab-x86-AMD-SPEC_CTRL-infra.patch
+  61f933ac-SVM-enable-MSR_SPEC_CTRL-for-guests.patch
+  61f946a2-VMX-drop-SPEC_CTRL-load-on-VMEntry.patch
+  6202afa3-x86-clean-up-MSR_MCU_OPT_CTRL-handling.patch
+  6202afa4-x86-TSX-move-has_rtm_always_abort.patch
+  6202afa5-x86-TSX-cope-with-deprecation-on-WHL-R-CFL-R.patch
+  6202afa7-x86-CPUID-leaf-7-2-EDX-infra.patch
+  6202afa8-x86-Intel-PSFD-for-guests.patch
+- Drop patches replaced by the above:
+  xsa393.patch
+  xsa394.patch
+  xsa395.patch
+  libxl-Fix-PV-hotplug-and-stubdom-coldplug.patch
+  libxl-dont-try-to-free-a-NULL-list-of-vcpus.patch
+  libxl-dont-touch-nr_vcpus_out-if-listing-vcpus-and-returning-NULL.patch
+
+-------------------------------------------------------------------

Old:
----
  libxl-Fix-PV-hotplug-and-stubdom-coldplug.patch
  libxl-dont-touch-nr_vcpus_out-if-listing-vcpus-and-returning-NULL.patch
  libxl-dont-try-to-free-a-NULL-list-of-vcpus.patch
  xsa393.patch
  xsa394.patch
  xsa395.patch

New:
----
  61e0296a-x86-time-calibration-relative-counts.patch
  61e029c8-x86-time-TSC-freq-calibration-accuracy.patch
  61e02a1c-libxl-PCI-PV-hotplug-stubdom-coldplug.patch
  61e98e88-x86-introduce-get-set-reg-infra.patch
  61e98e89-x86-MSR-split-SPEC_CTRL-handling.patch
  61e98e8a-x86-spec-ctrl-drop-ENTRY-EXIT-HVM.patch
  61e98e8b-VT-x-SPEC_CTRL-NMI-race-condition.patch
  61eaaa23-x86-get-set-reg-infra-build.patch
  61efec1d-Arm-P2M-always-clear-entry-on-mapping-removal.patch
  61efec4d-gnttab-only-decrement-refcounter-on-final-unmap.patch
  61efec96-IOMMU-x86-stop-pirq-iteration-immediately-on-error.patch
  61f2d886-x86-CPUID-disentangle-new-leaves-logic.patch
  61f2d887-x86-CPUID-leaf-7-1-EBX-infra.patch
  61f2dd76-x86-SPEC_CTRL-migration-compatibility.patch
  61f7b2af-libxl-dont-touch-nr_vcpus_out-if-listing.patch
  61f933a4-x86-cpuid-advertise-SSB_NO.patch
  61f933a5-x86-drop-use_spec_ctrl-boolean.patch
  61f933a6-x86-new-has_spec_ctrl-boolean.patch
  61f933a7-x86-dont-use-spec_ctrl-enter-exit-for-S3.patch
  61f933a8-x86-SPEC_CTRL-record-last-write.patch
  61f933a9-x86-SPEC_CTRL-use-common-logic-for-AMD.patch
  61f933aa-SVM-SPEC_CTRL-entry-exit-logic.patch
  61f933ab-x86-AMD-SPEC_CTRL-infra.patch
  61f933ac-SVM-enable-MSR_SPEC_CTRL-for-guests.patch
  61f946a2-VMX-drop-SPEC_CTRL-load-on-VMEntry.patch
  6202afa3-x86-clean-up-MSR_MCU_OPT_CTRL-handling.patch
  6202afa4-x86-TSX-move-has_rtm_always_abort.patch
  6202afa5-x86-TSX-cope-with-deprecation-on-WHL-R-CFL-R.patch
  6202afa7-x86-CPUID-leaf-7-2-EDX-infra.patch
  6202afa8-x86-Intel-PSFD-for-guests.patch

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ xen.spec ++++++
--- /var/tmp/diff_new_pack.HGscrv/_old  2022-02-22 21:18:24.086288548 +0100
+++ /var/tmp/diff_new_pack.HGscrv/_new  2022-02-22 21:18:24.090288548 +0100
@@ -119,7 +119,7 @@
 %endif
 Provides:       installhint(reboot-needed)
 
-Version:        4.16.0_04
+Version:        4.16.0_06
 Release:        0
 Summary:        Xen Virtualization: Hypervisor (aka VMM aka Microkernel)
 License:        GPL-2.0-only
@@ -159,10 +159,37 @@
 Patch2:         61b88e78-x86-CPUID-TSXLDTRK-definition.patch
 Patch3:         61bc429f-revert-hvmloader-PA-range-should-be-UC.patch
 Patch4:         61d5687a-x86-spec-ctrl-opt_srb_lock-default.patch
+Patch5:         61e0296a-x86-time-calibration-relative-counts.patch
+Patch6:         61e029c8-x86-time-TSC-freq-calibration-accuracy.patch
+Patch7:         61e02a1c-libxl-PCI-PV-hotplug-stubdom-coldplug.patch
+Patch8:         61e98e88-x86-introduce-get-set-reg-infra.patch
+Patch9:         61e98e89-x86-MSR-split-SPEC_CTRL-handling.patch
+Patch10:        61e98e8a-x86-spec-ctrl-drop-ENTRY-EXIT-HVM.patch
+Patch11:        61e98e8b-VT-x-SPEC_CTRL-NMI-race-condition.patch
+Patch12:        61eaaa23-x86-get-set-reg-infra-build.patch
+Patch13:        61efec1d-Arm-P2M-always-clear-entry-on-mapping-removal.patch
+Patch14:        61efec4d-gnttab-only-decrement-refcounter-on-final-unmap.patch
+Patch15:        61efec96-IOMMU-x86-stop-pirq-iteration-immediately-on-error.patch
+Patch16:        61f2d886-x86-CPUID-disentangle-new-leaves-logic.patch
+Patch17:        61f2d887-x86-CPUID-leaf-7-1-EBX-infra.patch
+Patch18:        61f2dd76-x86-SPEC_CTRL-migration-compatibility.patch
+Patch19:        61f7b2af-libxl-dont-touch-nr_vcpus_out-if-listing.patch
+Patch20:        61f933a4-x86-cpuid-advertise-SSB_NO.patch
+Patch21:        61f933a5-x86-drop-use_spec_ctrl-boolean.patch
+Patch22:        61f933a6-x86-new-has_spec_ctrl-boolean.patch
+Patch23:        61f933a7-x86-dont-use-spec_ctrl-enter-exit-for-S3.patch
+Patch24:        61f933a8-x86-SPEC_CTRL-record-last-write.patch
+Patch25:        61f933a9-x86-SPEC_CTRL-use-common-logic-for-AMD.patch
+Patch26:        61f933aa-SVM-SPEC_CTRL-entry-exit-logic.patch
+Patch27:        61f933ab-x86-AMD-SPEC_CTRL-infra.patch
+Patch28:        61f933ac-SVM-enable-MSR_SPEC_CTRL-for-guests.patch
+Patch29:        61f946a2-VMX-drop-SPEC_CTRL-load-on-VMEntry.patch
+Patch30:        6202afa3-x86-clean-up-MSR_MCU_OPT_CTRL-handling.patch
+Patch31:        6202afa4-x86-TSX-move-has_rtm_always_abort.patch
+Patch32:        6202afa5-x86-TSX-cope-with-deprecation-on-WHL-R-CFL-R.patch
+Patch33:        6202afa7-x86-CPUID-leaf-7-2-EDX-infra.patch
+Patch34:        6202afa8-x86-Intel-PSFD-for-guests.patch
 # EMBARGOED security fixes
-Patch11:        xsa393.patch
-Patch12:        xsa394.patch
-Patch13:        xsa395.patch
 # libxc
 Patch301:       libxc-bitmap-long.patch
 Patch302:       libxc-sr-xl-migration-debug.patch
@@ -224,9 +251,6 @@
 Patch467:       xenstore-run-in-studomain.patch
 Patch468:       libxl.helper_done-crash.patch
 Patch469:       libxl.LIBXL_HOTPLUG_TIMEOUT.patch
-Patch470:       libxl-Fix-PV-hotplug-and-stubdom-coldplug.patch
-Patch471:       libxl-dont-try-to-free-a-NULL-list-of-vcpus.patch
-Patch472:       libxl-dont-touch-nr_vcpus_out-if-listing-vcpus-and-returning-NULL.patch
 # python3 conversion patches
 Patch500:       build-python3-conversion.patch
 Patch501:       migration-python3-conversion.patch

++++++ 61e0296a-x86-time-calibration-relative-counts.patch ++++++
# Commit 467191641d2a2fd2e43b3ae7b80399f89d339980
# Date 2022-01-13 14:30:18 +0100
# Author Jan Beulich <[email protected]>
# Committer Jan Beulich <[email protected]>
x86/time: use relative counts in calibration loops

Looping until reaching/exceeding a certain value is error-prone: if the
target value is close enough to the wrapping point, the loop may not
terminate at all. Switch to using delta values, which then allows folding
the two loops each into just one.
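
As a minimal standalone C sketch (not part of the patch), the relative form
is wrap-safe because unsigned subtraction yields the true elapsed count even
across the wrapping point:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* Counter sitting just below the 32-bit wrapping point. */
        uint32_t count = 0xfffffff0u, target = 0x100;

        /* Absolute form: count + target wraps to a small value, so a
         * "while ( counter < count + target )" loop may never exit. */
        assert((uint32_t)(count + target) < count);

        /* Relative form: the wrapped difference still counts upwards,
         * so "while ( (now - count) < target )" terminates on time. */
        uint32_t now = count + 0x80;            /* wraps to 0x70 */
        assert(now - count == 0x80);            /* elapsed is correct */
        return 0;
    }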

Fixes: 93340297802b ("x86/time: calibrate TSC against platform timer")
Reported-by: Roger Pau Monné <[email protected]>
Signed-off-by: Jan Beulich <[email protected]>
Reviewed-by: Roger Pau Monné <[email protected]>

--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -378,7 +378,7 @@ static u64 read_hpet_count(void)
 static int64_t __init init_hpet(struct platform_timesource *pts)
 {
     uint64_t hpet_rate, start;
-    uint32_t count, target;
+    uint32_t count, target, elapsed;
     /*
      * Allow HPET to be setup, but report a frequency of 0 so it's not selected
      * as a timer source. This is required so it can be used in legacy
@@ -451,11 +451,8 @@ static int64_t __init init_hpet(struct p
 
     count = hpet_read32(HPET_COUNTER);
     start = rdtsc_ordered();
-    target = count + CALIBRATE_VALUE(hpet_rate);
-    if ( target < count )
-        while ( hpet_read32(HPET_COUNTER) >= count )
-            continue;
-    while ( hpet_read32(HPET_COUNTER) < target )
+    target = CALIBRATE_VALUE(hpet_rate);
+    while ( (elapsed = hpet_read32(HPET_COUNTER) - count) < target )
         continue;
 
     return (rdtsc_ordered() - start) * CALIBRATE_FRAC;
@@ -493,8 +490,8 @@ static u64 read_pmtimer_count(void)
 
 static s64 __init init_pmtimer(struct platform_timesource *pts)
 {
-    u64 start;
-    u32 count, target, mask;
+    uint64_t start;
+    uint32_t count, target, mask, elapsed;
 
     if ( !pmtmr_ioport || (pmtmr_width != 24 && pmtmr_width != 32) )
         return 0;
@@ -502,13 +499,10 @@ static s64 __init init_pmtimer(struct pl
     pts->counter_bits = pmtmr_width;
     mask = 0xffffffff >> (32 - pmtmr_width);
 
-    count = inl(pmtmr_ioport) & mask;
+    count = inl(pmtmr_ioport);
     start = rdtsc_ordered();
-    target = count + CALIBRATE_VALUE(ACPI_PM_FREQUENCY);
-    if ( target < count )
-        while ( (inl(pmtmr_ioport) & mask) >= count )
-            continue;
-    while ( (inl(pmtmr_ioport) & mask) < target )
+    target = CALIBRATE_VALUE(ACPI_PM_FREQUENCY);
+    while ( (elapsed = (inl(pmtmr_ioport) - count) & mask) < target )
         continue;
 
     return (rdtsc_ordered() - start) * CALIBRATE_FRAC;

++++++ 61e029c8-x86-time-TSC-freq-calibration-accuracy.patch ++++++
# Commit a5c9a80af34eefcd6e31d0ed2b083f452cd9076d
# Date 2022-01-13 14:31:52 +0100
# Author Jan Beulich <[email protected]>
# Committer Jan Beulich <[email protected]>
x86/time: improve TSC / CPU freq calibration accuracy

While the problem report was for extreme errors, even smaller ones would
better be avoided: The calculated period to run calibration loops over
can (and usually will) be shorter than the actual time elapsed between
first and last platform timer and TSC reads. Adjust values returned from
the init functions accordingly.
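
A short worked example with made-up numbers (the adjust_elapsed() helper in
the diff below does this via muldiv64() to avoid overflow): if the loop aimed
for target = 10000 timer ticks but actually observed 11000 ticks, the TSC
delta is scaled by target / actual before the frequency is derived:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t tsc_delta = 22000000;      /* rdtsc delta over the loop */
        uint32_t actual = 11000;            /* timer ticks really elapsed */
        uint32_t target = 10000;            /* timer ticks intended */

        /* Scale the TSC delta down to the intended interval; Xen uses
         * muldiv64() so the intermediate product cannot overflow. */
        if (actual > target)
            tsc_delta = tsc_delta * target / actual;

        printf("%llu\n", (unsigned long long)tsc_delta);  /* 20000000 */
        return 0;
    }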

On a Skylake system I've tested this on, accuracy (using HPET) went from
detecting in some cases more than 220kHz too high a value to about
±2kHz. On other systems (or on this system, but with PMTMR) the original
error range was much smaller, with less (in some cases only very little)
improvement.

Reported-by: James Dingwall <[email protected]>
Signed-off-by: Jan Beulich <[email protected]>
Reviewed-by: Roger Pau Monné <[email protected]>

--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -287,6 +287,23 @@ static char *freq_string(u64 freq)
     return s;
 }
 
+static uint64_t adjust_elapsed(uint64_t elapsed, uint32_t actual,
+                               uint32_t target)
+{
+    if ( likely(actual > target) )
+    {
+        /*
+         * A (perhaps significant) delay before the last timer read (e.g. due
+         * to a SMI or NMI) can lead to (perhaps severe) inaccuracy if not
+         * accounting for the time elapsed beyond the originally calculated
+         * duration of the calibration interval.
+         */
+        elapsed = muldiv64(elapsed, target, actual);
+    }
+
+    return elapsed * CALIBRATE_FRAC;
+}
+
 /************************************************************
  * PLATFORM TIMER 1: PROGRAMMABLE INTERVAL TIMER (LEGACY PIT)
  */
@@ -455,7 +472,7 @@ static int64_t __init init_hpet(struct p
     while ( (elapsed = hpet_read32(HPET_COUNTER) - count) < target )
         continue;
 
-    return (rdtsc_ordered() - start) * CALIBRATE_FRAC;
+    return adjust_elapsed(rdtsc_ordered() - start, elapsed, target);
 }
 
 static void resume_hpet(struct platform_timesource *pts)
@@ -505,7 +522,7 @@ static s64 __init init_pmtimer(struct pl
     while ( (elapsed = (inl(pmtmr_ioport) - count) & mask) < target )
         continue;
 
-    return (rdtsc_ordered() - start) * CALIBRATE_FRAC;
+    return adjust_elapsed(rdtsc_ordered() - start, elapsed, target);
 }
 
 static struct platform_timesource __initdata plt_pmtimer =

++++++ 61e02a1c-libxl-PCI-PV-hotplug-stubdom-coldplug.patch ++++++

References: bsc#1193307

# Commit 73ee2795aaef2cb086ac078bffe1c6b33c0ea91b
# Date 2022-01-13 14:33:16 +0100
# Author Jason Andryuk <[email protected]>
# Committer Jan Beulich <[email protected]>
libxl/PCI: Fix PV hotplug & stubdom coldplug

commit 0fdb48ffe7a1 "libxl: Make sure devices added by pci-attach are
reflected in the config" broken PCI hotplug (xl pci-attach) for PV
domains when it moved libxl__create_pci_backend() later in the function.

This also broke HVM + stubdom PCI passthrough coldplug.  For that, the
PCI devices are hotplugged to a running PV stubdom, and then the QEMU
QMP device_add commands are made to QEMU inside the stubdom.

A running PV domain calls libxl__wait_for_backend().  With the current
placement of libxl__create_pci_backend(), the path does not exist and
the call immediately fails:
libxl: error: libxl_device.c:1388:libxl__wait_for_backend: Backend /local/domain/0/backend/pci/43/0 does not exist
libxl: error: libxl_pci.c:1764:device_pci_add_done: Domain 42:libxl__device_pci_add failed for PCI device 0:2:0.0 (rc -3)
libxl: error: libxl_create.c:1857:domcreate_attach_devices: Domain 42:unable to add pci devices

The wait is only relevant when:
1) The domain is PV
2) The domain is running
3) The backend is already present

This is because:

1) xen-pcifront is only used for PV.  It does not load for HVM domains
   where QEMU is used.

2) If the domain is not running (starting), then the frontend state will
   be Initialising.  xen-pciback waits for the frontend to transition to
   at least Initialised before attempting to connect.  So a wait for a
   non-running domain is not applicable as the backend will not
   transition to Connected.

3) For presence, num_devs is already used to determine if the backend
   needs to be created.  Re-use num_devs to determine if the backend
   wait is necessary.  The wait is necessary to avoid racing with
   another PCI attachment reconfiguring the front/back or changing to
   some other state like closing.  If we are creating the backend, then
   we don't have to worry about the state since it is being created.

Fixes: 0fdb48ffe7a1 ("libxl: Make sure devices added by pci-attach are reflected in the config")

Signed-off-by: Jason Andryuk <[email protected]>
Reviewed-by: Paul Durrant <[email protected]>
Reviewed-by: Anthony PERARD <[email protected]>

--- a/tools/libs/light/libxl_pci.c
+++ b/tools/libs/light/libxl_pci.c
@@ -157,9 +157,11 @@ static int libxl__device_pci_add_xenstor
     if (domtype == LIBXL_DOMAIN_TYPE_INVALID)
         return ERROR_FAIL;
 
-    if (!starting && domtype == LIBXL_DOMAIN_TYPE_PV) {
-        if (libxl__wait_for_backend(gc, be_path, GCSPRINTF("%d", XenbusStateConnected)) < 0)
-            return ERROR_FAIL;
+    /* Wait is only needed if the backend already exists (num_devs != NULL) */
+    if (num_devs && !starting && domtype == LIBXL_DOMAIN_TYPE_PV) {
+        rc = libxl__wait_for_backend(gc, be_path,
+                                     GCSPRINTF("%d", XenbusStateConnected));
+        if (rc) return rc;
     }
 
     back = flexarray_make(gc, 16, 1);

++++++ 61e98e88-x86-introduce-get-set-reg-infra.patch ++++++
# Commit 88d3ff7ab15da277a85b39735797293fb541c718
# Date 2022-01-20 16:32:11 +0000
# Author Andrew Cooper <[email protected]>
# Committer Andrew Cooper <[email protected]>
x86/guest: Introduce {get,set}_reg() infrastructure

Various registers have per-guest-type or per-vendor locations or access
requirements.  To support their use from common code, provide accessors which
allow for per-guest-type behaviour.

For now, just infrastructure handling default cases and expectations.
Subsequent patches will start handling registers using this infrastructure.
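
A hedged, self-contained C reduction of the dispatch shape (names mirror the
diff below; the function-table plumbing is simplified, and the real code uses
alternative_call()): common code funnels through one accessor, per-vendor
hooks supply the behaviour, and unhandled registers are a fatal usage error:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct vcpu;

    /* Per-vendor hook, as in hvm_function_table's new .get_reg member. */
    static uint64_t vendor_get_reg(struct vcpu *v, unsigned int reg)
    {
        switch ( reg )
        {
        default:
            /* Stand-in for printk() + domain_crash() in the real code. */
            fprintf(stderr, "get_reg(0x%08x) Bad register\n", reg);
            exit(EXIT_FAILURE);
        }
    }

    /* Common-code accessor; later patches add register cases here. */
    static uint64_t hvm_get_reg(struct vcpu *v, unsigned int reg)
    {
        switch ( reg )
        {
        default:
            return vendor_get_reg(v, reg);
        }
    }

    int main(void)
    {
        return hvm_get_reg(NULL, 0x48 /* MSR_SPEC_CTRL */) != 0;
    }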

Signed-off-by: Andrew Cooper <[email protected]>
Reviewed-by: Jan Beulich <[email protected]>

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -3732,6 +3732,28 @@ gp_fault:
     return X86EMUL_EXCEPTION;
 }
 
+uint64_t hvm_get_reg(struct vcpu *v, unsigned int reg)
+{
+    ASSERT(v == current || !vcpu_runnable(v));
+
+    switch ( reg )
+    {
+    default:
+        return alternative_call(hvm_funcs.get_reg, v, reg);
+    }
+}
+
+void hvm_set_reg(struct vcpu *v, unsigned int reg, uint64_t val)
+{
+    ASSERT(v == current || !vcpu_runnable(v));
+
+    switch ( reg )
+    {
+    default:
+        return alternative_vcall(hvm_funcs.set_reg, v, reg, val);
+    }
+}
+
 static bool is_sysdesc_access(const struct x86_emulate_state *state,
                               const struct x86_emulate_ctxt *ctxt)
 {
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -2469,6 +2469,33 @@ static bool svm_get_pending_event(struct
     return true;
 }
 
+static uint64_t svm_get_reg(struct vcpu *v, unsigned int reg)
+{
+    struct domain *d = v->domain;
+
+    switch ( reg )
+    {
+    default:
+        printk(XENLOG_G_ERR "%s(%pv, 0x%08x) Bad register\n",
+               __func__, v, reg);
+        domain_crash(d);
+        return 0;
+    }
+}
+
+static void svm_set_reg(struct vcpu *v, unsigned int reg, uint64_t val)
+{
+    struct domain *d = v->domain;
+
+    switch ( reg )
+    {
+    default:
+        printk(XENLOG_G_ERR "%s(%pv, 0x%08x, 0x%016"PRIx64") Bad register\n",
+               __func__, v, reg, val);
+        domain_crash(d);
+    }
+}
+
 static struct hvm_function_table __initdata svm_function_table = {
     .name                 = "SVM",
     .cpu_up_prepare       = svm_cpu_up_prepare,
@@ -2518,6 +2545,9 @@ static struct hvm_function_table __initd
     .nhvm_intr_blocked = nsvm_intr_blocked,
     .nhvm_hap_walk_L1_p2m = nsvm_hap_walk_L1_p2m,
 
+    .get_reg = svm_get_reg,
+    .set_reg = svm_set_reg,
+
     .tsc_scaling = {
         .max_ratio = ~TSC_RATIO_RSVD_BITS,
     },
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2404,6 +2404,33 @@ static int vmtrace_reset(struct vcpu *v)
     return 0;
 }
 
+static uint64_t vmx_get_reg(struct vcpu *v, unsigned int reg)
+{
+    struct domain *d = v->domain;
+
+    switch ( reg )
+    {
+    default:
+        printk(XENLOG_G_ERR "%s(%pv, 0x%08x) Bad register\n",
+               __func__, v, reg);
+        domain_crash(d);
+        return 0;
+    }
+}
+
+static void vmx_set_reg(struct vcpu *v, unsigned int reg, uint64_t val)
+{
+    struct domain *d = v->domain;
+
+    switch ( reg )
+    {
+    default:
+        printk(XENLOG_G_ERR "%s(%pv, 0x%08x, 0x%016"PRIx64") Bad register\n",
+               __func__, v, reg, val);
+        domain_crash(d);
+    }
+}
+
 static struct hvm_function_table __initdata vmx_function_table = {
     .name                 = "VMX",
     .cpu_up_prepare       = vmx_cpu_up_prepare,
@@ -2464,6 +2491,10 @@ static struct hvm_function_table __initd
     .vmtrace_set_option = vmtrace_set_option,
     .vmtrace_get_option = vmtrace_get_option,
     .vmtrace_reset = vmtrace_reset,
+
+    .get_reg = vmx_get_reg,
+    .set_reg = vmx_set_reg,
+
     .tsc_scaling = {
         .max_ratio = VMX_TSC_MULTIPLIER_MAX,
     },
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -222,6 +222,9 @@ struct hvm_function_table {
     int (*vmtrace_get_option)(struct vcpu *v, uint64_t key, uint64_t *value);
     int (*vmtrace_reset)(struct vcpu *v);
 
+    uint64_t (*get_reg)(struct vcpu *v, unsigned int reg);
+    void (*set_reg)(struct vcpu *v, unsigned int reg, uint64_t val);
+
     /*
      * Parameters and callbacks for hardware-assisted TSC scaling,
      * which are valid only when the hardware feature is available.
@@ -728,6 +731,18 @@ static inline int hvm_vmtrace_reset(stru
 }
 
 /*
+ * Accessors for registers which have per-guest-type or per-vendor locations
+ * (e.g. VMCS, msr load/save lists, VMCB, VMLOAD lazy, etc).
+ *
+ * The caller is responsible for all auditing - these accessors do not fail,
+ * but do use domain_crash() for usage errors.
+ *
+ * Must cope with being called in non-current context.
+ */
+uint64_t hvm_get_reg(struct vcpu *v, unsigned int reg);
+void hvm_set_reg(struct vcpu *v, unsigned int reg, uint64_t val);
+
+/*
  * This must be defined as a macro instead of an inline function,
  * because it uses 'struct vcpu' and 'struct domain' which have
  * not been defined yet.
--- a/xen/include/asm-x86/pv/domain.h
+++ b/xen/include/asm-x86/pv/domain.h
@@ -65,6 +65,10 @@ static inline unsigned long get_pcid_bit
 #endif
 }
 
+/* See hvm_{get,set}_reg() for description. */
+uint64_t pv_get_reg(struct vcpu *v, unsigned int reg);
+void pv_set_reg(struct vcpu *v, unsigned int reg, uint64_t val);
+
 #ifdef CONFIG_PV
 
 void pv_vcpu_destroy(struct vcpu *v);
--- a/xen/arch/x86/pv/emulate.c
+++ b/xen/arch/x86/pv/emulate.c
@@ -90,6 +90,37 @@ void pv_emul_instruction_done(struct cpu
     }
 }
 
+uint64_t pv_get_reg(struct vcpu *v, unsigned int reg)
+{
+    struct domain *d = v->domain;
+
+    ASSERT(v == current || !vcpu_runnable(v));
+
+    switch ( reg )
+    {
+    default:
+        printk(XENLOG_G_ERR "%s(%pv, 0x%08x) Bad register\n",
+               __func__, v, reg);
+        domain_crash(d);
+        return 0;
+    }
+}
+
+void pv_set_reg(struct vcpu *v, unsigned int reg, uint64_t val)
+{
+    struct domain *d = v->domain;
+
+    ASSERT(v == current || !vcpu_runnable(v));
+
+    switch ( reg )
+    {
+    default:
+        printk(XENLOG_G_ERR "%s(%pv, 0x%08x, 0x%016"PRIx64") Bad register\n",
+               __func__, v, reg, val);
+        domain_crash(d);
+    }
+}
+
 /*
  * Local variables:
  * mode: C

++++++ 61e98e89-x86-MSR-split-SPEC_CTRL-handling.patch ++++++
# Commit 6536688439dbca1d08fd6db5be29c39e3917fb2f
# Date 2022-01-20 16:32:11 +0000
# Author Andrew Cooper <[email protected]>
# Committer Andrew Cooper <[email protected]>
x86/msr: Split MSR_SPEC_CTRL handling

In order to fix a VT-x bug, and support MSR_SPEC_CTRL on AMD, move
MSR_SPEC_CTRL handling into the new {pv,hvm}_{get,set}_reg() infrastructure.

Duplicate the msrs->spec_ctrl.raw accesses in the PV and VT-x paths for now.
The SVM path is currently unreachable because of the CPUID policy.

No functional change.

Signed-off-by: Andrew Cooper <[email protected]>
Reviewed-by: Jan Beulich <[email protected]>

--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2410,6 +2410,9 @@ static uint64_t vmx_get_reg(struct vcpu
 
     switch ( reg )
     {
+    case MSR_SPEC_CTRL:
+        return v->arch.msrs->spec_ctrl.raw;
+
     default:
         printk(XENLOG_G_ERR "%s(%pv, 0x%08x) Bad register\n",
                __func__, v, reg);
@@ -2424,6 +2427,10 @@ static void vmx_set_reg(struct vcpu *v,
 
     switch ( reg )
     {
+    case MSR_SPEC_CTRL:
+        v->arch.msrs->spec_ctrl.raw = val;
+        break;
+
     default:
         printk(XENLOG_G_ERR "%s(%pv, 0x%08x, 0x%016"PRIx64") Bad register\n",
                __func__, v, reg, val);
--- a/xen/arch/x86/msr.c
+++ b/xen/arch/x86/msr.c
@@ -28,6 +28,7 @@
 #include <asm/hvm/nestedhvm.h>
 #include <asm/hvm/viridian.h>
 #include <asm/msr.h>
+#include <asm/pv/domain.h>
 #include <asm/setup.h>
 
 #include <public/hvm/params.h>
@@ -265,8 +266,7 @@ int guest_rdmsr(struct vcpu *v, uint32_t
     case MSR_SPEC_CTRL:
         if ( !cp->feat.ibrsb )
             goto gp_fault;
-        *val = msrs->spec_ctrl.raw;
-        break;
+        goto get_reg;
 
     case MSR_INTEL_PLATFORM_INFO:
         *val = mp->platform_info.raw;
@@ -424,6 +424,13 @@ int guest_rdmsr(struct vcpu *v, uint32_t
 
     return ret;
 
+ get_reg: /* Delegate register access to per-vm-type logic. */
+    if ( is_pv_domain(d) )
+        *val = pv_get_reg(v, msr);
+    else
+        *val = hvm_get_reg(v, msr);
+    return X86EMUL_OKAY;
+
  gp_fault:
     return X86EMUL_EXCEPTION;
 }
@@ -513,9 +520,7 @@ int guest_wrmsr(struct vcpu *v, uint32_t
 
         if ( val & rsvd )
             goto gp_fault; /* Rsvd bit set? */
-
-        msrs->spec_ctrl.raw = val;
-        break;
+        goto set_reg;
 
     case MSR_PRED_CMD:
         if ( !cp->feat.ibrsb && !cp->extd.ibpb )
@@ -663,6 +668,13 @@ int guest_wrmsr(struct vcpu *v, uint32_t
 
     return ret;
 
+ set_reg: /* Delegate register access to per-vm-type logic. */
+    if ( is_pv_domain(d) )
+        pv_set_reg(v, msr, val);
+    else
+        hvm_set_reg(v, msr, val);
+    return X86EMUL_OKAY;
+
  gp_fault:
     return X86EMUL_EXCEPTION;
 }
--- a/xen/arch/x86/pv/emulate.c
+++ b/xen/arch/x86/pv/emulate.c
@@ -92,12 +92,16 @@ void pv_emul_instruction_done(struct cpu
 
 uint64_t pv_get_reg(struct vcpu *v, unsigned int reg)
 {
+    const struct vcpu_msrs *msrs = v->arch.msrs;
     struct domain *d = v->domain;
 
     ASSERT(v == current || !vcpu_runnable(v));
 
     switch ( reg )
     {
+    case MSR_SPEC_CTRL:
+        return msrs->spec_ctrl.raw;
+
     default:
         printk(XENLOG_G_ERR "%s(%pv, 0x%08x) Bad register\n",
                __func__, v, reg);
@@ -108,12 +112,17 @@ uint64_t pv_get_reg(struct vcpu *v, unsi
 
 void pv_set_reg(struct vcpu *v, unsigned int reg, uint64_t val)
 {
+    struct vcpu_msrs *msrs = v->arch.msrs;
     struct domain *d = v->domain;
 
     ASSERT(v == current || !vcpu_runnable(v));
 
     switch ( reg )
     {
+    case MSR_SPEC_CTRL:
+        msrs->spec_ctrl.raw = val;
+        break;
+
     default:
         printk(XENLOG_G_ERR "%s(%pv, 0x%08x, 0x%016"PRIx64") Bad register\n",
                __func__, v, reg, val);

++++++ 61e98e8a-x86-spec-ctrl-drop-ENTRY-EXIT-HVM.patch ++++++
# Commit 95b13fa43e0753b7514bef13abe28253e8614f62
# Date 2022-01-20 16:32:11 +0000
# Author Andrew Cooper <[email protected]>
# Committer Andrew Cooper <[email protected]>
x86/spec-ctrl: Drop SPEC_CTRL_{ENTRY_FROM,EXIT_TO}_HVM

These were written before Spectre/Meltdown went public, and there was large
uncertainty in how the protections would evolve.  As it turns out, they're
very specific to Intel hardware, and not very suitable for AMD.

Drop the macros, opencoding the relevant subset of functionality, and leaving
grep-fodder to locate the logic.  No change at all for VT-x.

For AMD, the only relevant piece of functionality is DO_OVERWRITE_RSB,
although we will soon be adding (different) logic to handle MSR_SPEC_CTRL.

This has a marginal improvement of removing an unconditional pile of long-nops
from the vmentry/exit path.

Signed-off-by: Andrew Cooper <[email protected]>
Reviewed-by: Roger Pau Monné <[email protected]>

--- a/xen/arch/x86/hvm/svm/entry.S
+++ b/xen/arch/x86/hvm/svm/entry.S
@@ -59,7 +59,7 @@ __UNLIKELY_END(nsvm_hap)
         mov VCPUMSR_spec_ctrl_raw(%rax), %eax
 
         /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */
-        SPEC_CTRL_EXIT_TO_HVM   /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */
+        /* SPEC_CTRL_EXIT_TO_SVM   (nothing currently) */
 
         pop  %r15
         pop  %r14
@@ -86,7 +86,8 @@ __UNLIKELY_END(nsvm_hap)
 
         GET_CURRENT(bx)
 
-        SPEC_CTRL_ENTRY_FROM_HVM    /* Req: b=curr %rsp=regs/cpuinfo, Clob: acd */
+        /* SPEC_CTRL_ENTRY_FROM_SVM    Req: b=curr %rsp=regs/cpuinfo, Clob: ac  */
+        ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_HVM
         /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
 
         stgi
--- a/xen/arch/x86/hvm/vmx/entry.S
+++ b/xen/arch/x86/hvm/vmx/entry.S
@@ -33,7 +33,9 @@ ENTRY(vmx_asm_vmexit_handler)
         movb $1,VCPU_vmx_launched(%rbx)
         mov  %rax,VCPU_hvm_guest_cr2(%rbx)
 
-        SPEC_CTRL_ENTRY_FROM_HVM    /* Req: b=curr %rsp=regs/cpuinfo, Clob: acd */
+        /* SPEC_CTRL_ENTRY_FROM_VMX    Req: b=curr %rsp=regs/cpuinfo, Clob: acd */
+        ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_HVM
+        ALTERNATIVE "", DO_SPEC_CTRL_ENTRY_FROM_HVM, X86_FEATURE_SC_MSR_HVM
         /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
 
         /* Hardware clears MSR_DEBUGCTL on VMExit.  Reinstate it if debugging Xen. */
@@ -80,7 +82,9 @@ UNLIKELY_END(realmode)
         mov VCPUMSR_spec_ctrl_raw(%rax), %eax
 
         /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */
-        SPEC_CTRL_EXIT_TO_HVM   /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */
+        /* SPEC_CTRL_EXIT_TO_VMX   Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */
+        ALTERNATIVE "", DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_HVM
+        ALTERNATIVE "", __stringify(verw CPUINFO_verw_sel(%rsp)), X86_FEATURE_SC_VERW_HVM
 
         mov  VCPU_hvm_guest_cr2(%rbx),%rax
 
--- a/xen/include/asm-x86/spec_ctrl_asm.h
+++ b/xen/include/asm-x86/spec_ctrl_asm.h
@@ -68,14 +68,16 @@
  *
  * The following ASM fragments implement this algorithm.  See their local
  * comments for further details.
- *  - SPEC_CTRL_ENTRY_FROM_HVM
  *  - SPEC_CTRL_ENTRY_FROM_PV
  *  - SPEC_CTRL_ENTRY_FROM_INTR
  *  - SPEC_CTRL_ENTRY_FROM_INTR_IST
  *  - SPEC_CTRL_EXIT_TO_XEN_IST
  *  - SPEC_CTRL_EXIT_TO_XEN
  *  - SPEC_CTRL_EXIT_TO_PV
- *  - SPEC_CTRL_EXIT_TO_HVM
+ *
+ * Additionally, the following grep-fodder exists to find the HVM logic.
+ *  - SPEC_CTRL_ENTRY_FROM_{SVM,VMX}
+ *  - SPEC_CTRL_EXIT_TO_{SVM,VMX}
  */
 
 .macro DO_OVERWRITE_RSB tmp=rax
@@ -225,12 +227,6 @@
     wrmsr
 .endm
 
-/* Use after a VMEXIT from an HVM guest. */
-#define SPEC_CTRL_ENTRY_FROM_HVM                                        \
-    ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_HVM;           \
-    ALTERNATIVE "", DO_SPEC_CTRL_ENTRY_FROM_HVM,                        \
-        X86_FEATURE_SC_MSR_HVM
-
 /* Use after an entry from PV context (syscall/sysenter/int80/int82/etc). */
 #define SPEC_CTRL_ENTRY_FROM_PV                                         \
     ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV;            \
@@ -255,13 +251,6 @@
     ALTERNATIVE "", __stringify(verw CPUINFO_verw_sel(%rsp)),           \
         X86_FEATURE_SC_VERW_PV
 
-/* Use when exiting to HVM guest context. */
-#define SPEC_CTRL_EXIT_TO_HVM                                           \
-    ALTERNATIVE "",                                                     \
-        DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_HVM;             \
-    ALTERNATIVE "", __stringify(verw CPUINFO_verw_sel(%rsp)),           \
-        X86_FEATURE_SC_VERW_HVM
-
 /*
  * Use in IST interrupt/exception context.  May interrupt Xen or PV context.
  * Fine grain control of SCF_ist_wrmsr is needed for safety in the S3 resume

++++++ 61e98e8b-VT-x-SPEC_CTRL-NMI-race-condition.patch ++++++
# Commit 81f0eaadf84d273a6ff8df3660b874a02d0e7677
# Date 2022-01-20 16:32:11 +0000
# Author Andrew Cooper <[email protected]>
# Committer Andrew Cooper <[email protected]>
x86/spec-ctrl: Fix NMI race condition with VT-x MSR_SPEC_CTRL handling

The logic was based on a mistaken understanding of how NMI blocking on vmexit
works.  NMIs are only blocked for EXIT_REASON_NMI, and not for general exits.
Therefore, an NMI can in general hit early in the vmx_asm_vmexit_handler path,
and the guest's value will be clobbered before it is saved.

Switch to using MSR load/save lists.  This causes the guest value to be saved
atomically with respect to NMIs/MCEs/etc.

First, update vmx_cpuid_policy_changed() to configure the load/save lists at
the same time as configuring the intercepts.  This function is always used in
remote context, so extend the vmx_vmcs_{enter,exit}() block to cover the whole
function, rather than having multiple remote acquisitions of the same VMCS.

Both of vmx_{add,del}_guest_msr() can fail.  The -ESRCH delete case is fine,
but all others are fatal to the running of the VM, so handle them using
domain_crash() - this path is only used during domain construction anyway.

Second, update vmx_{get,set}_reg() to use the MSR load/save lists rather than
vcpu_msrs, and update the vcpu_msrs comment to describe the new state
location.

Finally, adjust the entry/exit asm.

Because the guest value is saved and loaded atomically, we do not need to
manually load the guest value, nor do we need to enable SCF_use_shadow.  This
lets us remove the use of DO_SPEC_CTRL_EXIT_TO_GUEST.  Additionally,
SPEC_CTRL_ENTRY_FROM_PV gets removed too, because on an early entry failure,
we're no longer in the guest MSR_SPEC_CTRL context needing to switch back to
Xen's context.

The only action remaining is to load Xen's MSR_SPEC_CTRL value on vmexit.  We
could in principle use the host msr list, but this is expected to complicate
future work.  Delete DO_SPEC_CTRL_ENTRY_FROM_HVM entirely, and use a shorter
code sequence to simply reload Xen's setting from the top-of-stack block.

Adjust the comment at the top of spec_ctrl_asm.h in light of this bugfix.

Signed-off-by: Andrew Cooper <[email protected]>
Reviewed-by: Jan Beulich <[email protected]>

--- a/xen/arch/x86/hvm/vmx/entry.S
+++ b/xen/arch/x86/hvm/vmx/entry.S
@@ -35,7 +35,14 @@ ENTRY(vmx_asm_vmexit_handler)
 
         /* SPEC_CTRL_ENTRY_FROM_VMX    Req: b=curr %rsp=regs/cpuinfo, Clob: acd */
         ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_HVM
-        ALTERNATIVE "", DO_SPEC_CTRL_ENTRY_FROM_HVM, X86_FEATURE_SC_MSR_HVM
+
+        .macro restore_spec_ctrl
+            mov    $MSR_SPEC_CTRL, %ecx
+            movzbl CPUINFO_xen_spec_ctrl(%rsp), %eax
+            xor    %edx, %edx
+            wrmsr
+        .endm
+        ALTERNATIVE "", restore_spec_ctrl, X86_FEATURE_SC_MSR_HVM
         /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
 
         /* Hardware clears MSR_DEBUGCTL on VMExit.  Reinstate it if debugging Xen. */
@@ -82,8 +89,7 @@ UNLIKELY_END(realmode)
         mov VCPUMSR_spec_ctrl_raw(%rax), %eax
 
         /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */
-        /* SPEC_CTRL_EXIT_TO_VMX   Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */
-        ALTERNATIVE "", DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_HVM
+        /* SPEC_CTRL_EXIT_TO_VMX   Req: %rsp=regs/cpuinfo              Clob:    */
         ALTERNATIVE "", __stringify(verw CPUINFO_verw_sel(%rsp)), X86_FEATURE_SC_VERW_HVM
 
         mov  VCPU_hvm_guest_cr2(%rbx),%rax
@@ -119,12 +125,12 @@ UNLIKELY_END(realmode)
         SAVE_ALL
 
         /*
-         * PV variant needed here as no guest code has executed (so
-         * MSR_SPEC_CTRL can't have changed value), and NMIs/MCEs are liable
-         * to hit (in which case the HVM variant might corrupt things).
+         * SPEC_CTRL_ENTRY notes
+         *
+         * If we end up here, no guest code has executed.  The MSR lists have
+         * not been processed, so we still have Xen's choice of MSR_SPEC_CTRL
+         * in context, and the RSB is unchanged.
          */
-        SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo Clob: acd */
-        /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
 
         call vmx_vmentry_failure
         jmp  .Lvmx_process_softirqs
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -592,6 +592,7 @@ void vmx_update_exception_bitmap(struct
 static void vmx_cpuid_policy_changed(struct vcpu *v)
 {
     const struct cpuid_policy *cp = v->domain->arch.cpuid;
+    int rc = 0;
 
     if ( opt_hvm_fep ||
          (v->domain->arch.cpuid->x86_vendor != boot_cpu_data.x86_vendor) )
@@ -601,17 +602,29 @@ static void vmx_cpuid_policy_changed(str
 
     vmx_vmcs_enter(v);
     vmx_update_exception_bitmap(v);
-    vmx_vmcs_exit(v);
 
     /*
      * We can safely pass MSR_SPEC_CTRL through to the guest, even if STIBP
      * isn't enumerated in hardware, as SPEC_CTRL_STIBP is ignored.
      */
     if ( cp->feat.ibrsb )
+    {
         vmx_clear_msr_intercept(v, MSR_SPEC_CTRL, VMX_MSR_RW);
+
+        rc = vmx_add_guest_msr(v, MSR_SPEC_CTRL, 0);
+        if ( rc )
+            goto out;
+    }
     else
+    {
         vmx_set_msr_intercept(v, MSR_SPEC_CTRL, VMX_MSR_RW);
 
+        rc = vmx_del_msr(v, MSR_SPEC_CTRL, VMX_MSR_GUEST);
+        if ( rc && rc != -ESRCH )
+            goto out;
+        rc = 0; /* Tolerate -ESRCH */
+    }
+
     /* MSR_PRED_CMD is safe to pass through if the guest knows about it. */
     if ( cp->feat.ibrsb || cp->extd.ibpb )
         vmx_clear_msr_intercept(v, MSR_PRED_CMD,  VMX_MSR_RW);
@@ -623,6 +636,15 @@ static void vmx_cpuid_policy_changed(str
         vmx_clear_msr_intercept(v, MSR_FLUSH_CMD, VMX_MSR_RW);
     else
         vmx_set_msr_intercept(v, MSR_FLUSH_CMD, VMX_MSR_RW);
+
+ out:
+    vmx_vmcs_exit(v);
+
+    if ( rc )
+    {
+        printk(XENLOG_G_ERR "%pv MSR list error: %d", v, rc);
+        domain_crash(v->domain);
+    }
 }
 
 int vmx_guest_x86_mode(struct vcpu *v)
@@ -2407,11 +2429,20 @@ static int vmtrace_reset(struct vcpu *v)
 static uint64_t vmx_get_reg(struct vcpu *v, unsigned int reg)
 {
     struct domain *d = v->domain;
+    uint64_t val = 0;
+    int rc;
 
     switch ( reg )
     {
     case MSR_SPEC_CTRL:
-        return v->arch.msrs->spec_ctrl.raw;
+        rc = vmx_read_guest_msr(v, reg, &val);
+        if ( rc )
+        {
+            printk(XENLOG_G_ERR "%s(%pv, 0x%08x) MSR list error: %d\n",
+                   __func__, v, reg, rc);
+            domain_crash(d);
+        }
+        return val;
 
     default:
         printk(XENLOG_G_ERR "%s(%pv, 0x%08x) Bad register\n",
@@ -2424,11 +2455,18 @@ static uint64_t vmx_get_reg(struct vcpu
 static void vmx_set_reg(struct vcpu *v, unsigned int reg, uint64_t val)
 {
     struct domain *d = v->domain;
+    int rc;
 
     switch ( reg )
     {
     case MSR_SPEC_CTRL:
-        v->arch.msrs->spec_ctrl.raw = val;
+        rc = vmx_write_guest_msr(v, reg, val);
+        if ( rc )
+        {
+            printk(XENLOG_G_ERR "%s(%pv, 0x%08x) MSR list error: %d\n",
+                   __func__, v, reg, rc);
+            domain_crash(d);
+        }
         break;
 
     default:
--- a/xen/include/asm-x86/msr.h
+++ b/xen/include/asm-x86/msr.h
@@ -287,7 +287,15 @@ extern struct msr_policy     raw_msr_pol
 /* Container object for per-vCPU MSRs */
 struct vcpu_msrs
 {
-    /* 0x00000048 - MSR_SPEC_CTRL */
+    /*
+     * 0x00000048 - MSR_SPEC_CTRL
+     *
+     * For PV guests, this holds the guest kernel value.  It is accessed on
+     * every entry/exit path.
+     *
+     * For VT-x guests, the guest value is held in the MSR guest load/save
+     * list.
+     */
     struct {
         uint32_t raw;
     } spec_ctrl;
--- a/xen/include/asm-x86/spec_ctrl_asm.h
+++ b/xen/include/asm-x86/spec_ctrl_asm.h
@@ -42,9 +42,10 @@
  *     path, or late in the exit path after restoring the guest value.  This
  *     will corrupt the guest value.
  *
- * Factor 1 is dealt with by relying on NMIs/MCEs being blocked immediately
- * after VMEXIT.  The VMEXIT-specific code reads MSR_SPEC_CTRL and updates
- * current before loading Xen's MSR_SPEC_CTRL setting.
+ * Factor 1 is dealt with:
+ *   - On VMX by using MSR load/save lists to have vmentry/exit atomically
+ *     load/save the guest value.  Xen's value is loaded in regular code, and
+ *     there is no need to use the shadow logic (below).
  *
  * Factor 2 is harder.  We maintain a shadow_spec_ctrl value, and a use_shadow
  * boolean in the per cpu spec_ctrl_flags.  The synchronous use is:
@@ -128,31 +129,6 @@
 #endif
 .endm
 
-.macro DO_SPEC_CTRL_ENTRY_FROM_HVM
-/*
- * Requires %rbx=current, %rsp=regs/cpuinfo
- * Clobbers %rax, %rcx, %rdx
- *
- * The common case is that a guest has direct access to MSR_SPEC_CTRL, at
- * which point we need to save the guest value before setting IBRS for Xen.
- * Unilaterally saving the guest value is shorter and faster than checking.
- */
-    mov $MSR_SPEC_CTRL, %ecx
-    rdmsr
-
-    /* Stash the value from hardware. */
-    mov VCPU_arch_msrs(%rbx), %rdx
-    mov %eax, VCPUMSR_spec_ctrl_raw(%rdx)
-    xor %edx, %edx
-
-    /* Clear SPEC_CTRL shadowing *before* loading Xen's value. */
-    andb $~SCF_use_shadow, CPUINFO_spec_ctrl_flags(%rsp)
-
-    /* Load Xen's intended value. */
-    movzbl CPUINFO_xen_spec_ctrl(%rsp), %eax
-    wrmsr
-.endm
-
 .macro DO_SPEC_CTRL_ENTRY maybexen:req
 /*
  * Requires %rsp=regs (also cpuinfo if !maybexen)

++++++ 61eaaa23-x86-get-set-reg-infra-build.patch ++++++
# Commit 13caa585791234fe3e3719c8376f7ea731012451
# Date 2022-01-21 12:42:11 +0000
# Author Andrew Cooper <[email protected]>
# Committer Andrew Cooper <[email protected]>
x86: Fix build with the get/set_reg() infrastructure

I clearly messed up concluding that the stubs were safe to drop.

The is_{pv,hvm}_domain() predicates are not symmetrical with both CONFIG_PV
and CONFIG_HVM.  As a result logic of the form `if ( pv/hvm ) ... else ...`
will always have one side which can't be DCE'd.

While technically only the hvm stubs are needed, due to the use of the
is_pv_domain() predicate in guest_{rd,wr}msr(), sort out the pv stubs too to
avoid leaving a bear trap for future users.
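
A small C sketch of the DCE point (assumed, simplified semantics): unless the
predicate folds to a compile-time constant, both branches are emitted, so the
disabled side must still resolve to a stub at link time:

    #include <stdbool.h>
    #include <stdint.h>

    /* Mimics is_pv_domain(): not a compile-time constant here, so the
     * compiler must keep both calls below alive. */
    static bool is_pv_domain(const void *d) { return d != (void *)0; }

    static uint64_t pv_get_reg(void)  { return 1; } /* stub if !CONFIG_PV  */
    static uint64_t hvm_get_reg(void) { return 2; } /* stub if !CONFIG_HVM */

    uint64_t guest_rdmsr_shape(const void *d)
    {
        /* Same shape as guest_rdmsr()'s get_reg delegation. */
        return is_pv_domain(d) ? pv_get_reg() : hvm_get_reg();
    }

    int main(void) { return guest_rdmsr_shape(0) == 2 ? 0 : 1; }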

Fixes: 88d3ff7ab15d ("x86/guest: Introduce {get,set}_reg() infrastructure")
Signed-off-by: Andrew Cooper <[email protected]>
Acked-by: Jan Beulich <[email protected]>

--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -865,6 +865,16 @@ static inline int hvm_vmtrace_get_option
     return -EOPNOTSUPP;
 }
 
+static inline uint64_t hvm_get_reg(struct vcpu *v, unsigned int reg)
+{
+    ASSERT_UNREACHABLE();
+    return 0;
+}
+static inline void hvm_set_reg(struct vcpu *v, unsigned int reg, uint64_t val)
+{
+    ASSERT_UNREACHABLE();
+}
+
 #define is_viridian_domain(d) ((void)(d), false)
 #define is_viridian_vcpu(v) ((void)(v), false)
 #define has_viridian_time_ref_count(d) ((void)(d), false)
--- a/xen/include/asm-x86/pv/domain.h
+++ b/xen/include/asm-x86/pv/domain.h
@@ -65,10 +65,6 @@ static inline unsigned long get_pcid_bit
 #endif
 }
 
-/* See hvm_{get,set}_reg() for description. */
-uint64_t pv_get_reg(struct vcpu *v, unsigned int reg);
-void pv_set_reg(struct vcpu *v, unsigned int reg, uint64_t val);
-
 #ifdef CONFIG_PV
 
 void pv_vcpu_destroy(struct vcpu *v);
@@ -93,6 +89,10 @@ unsigned long pv_fixup_guest_cr4(const s
 /* Create a cr4 value to load into hardware, based on vcpu settings. */
 unsigned long pv_make_cr4(const struct vcpu *v);
 
+/* See hvm_{get,set}_reg() for description. */
+uint64_t pv_get_reg(struct vcpu *v, unsigned int reg);
+void pv_set_reg(struct vcpu *v, unsigned int reg, uint64_t val);
+
 bool xpti_pcid_enabled(void);
 
 #else  /* !CONFIG_PV */
@@ -106,6 +106,16 @@ static inline int pv_domain_initialise(s
 
 static inline unsigned long pv_make_cr4(const struct vcpu *v) { return ~0ul; }
 
+static inline uint64_t pv_get_reg(struct vcpu *v, unsigned int reg)
+{
+    ASSERT_UNREACHABLE();
+    return 0;
+}
+static inline void pv_set_reg(struct vcpu *v, unsigned int reg, uint64_t val)
+{
+    ASSERT_UNREACHABLE();
+}
+
 #endif /* CONFIG_PV */
 
 void paravirt_ctxt_switch_from(struct vcpu *v);

++++++ 61efec1d-Arm-P2M-always-clear-entry-on-mapping-removal.patch ++++++
# Commit a428b913a002eb2b7425b48029c20a52eeee1b5a
# Date 2022-01-25 13:25:01 +0100
# Author Julien Grall <[email protected]>
# Committer Jan Beulich <[email protected]>
xen/arm: p2m: Always clear the P2M entry when the mapping is removed

Commit 2148a125b73b ("xen/arm: Track page accessed between batch of
Set/Way operations") allowed an entry to be invalid from the CPU PoV
(lpae_is_valid()) but valid for Xen (p2m_is_valid()). This is useful
to track which page is accessed and only perform an action on them
(e.g. clean & invalidate the cache after a set/way instruction).

Unfortunately, __p2m_set_entry() is only zeroing the P2M entry when
lpae_is_valid() returns true. This means the entry will not be zeroed
if the entry was valid from Xen PoV but invalid from the CPU PoV for
tracking purpose.

As a consequence, this will allow a domain to continue to access the
page after it was removed.

Resolve the issue by always zeroing the entry if the LPAE valid bit is
set or the entry is about to be removed.

This is CVE-2022-23033 / XSA-393.

Reported-by: Dmytro Firsov <[email protected]>
Fixes: 2148a125b73b ("xen/arm: Track page accessed between batch of Set/Way operations")
Reviewed-by: Stefano Stabellini <[email protected]>
Signed-off-by: Julien Grall <[email protected]>

--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -1016,7 +1016,7 @@ static int __p2m_set_entry(struct p2m_do
      * sequence when updating the translation table (D4.7.1 in ARM DDI
      * 0487A.j).
      */
-    if ( lpae_is_valid(orig_pte) )
+    if ( lpae_is_valid(orig_pte) || removing_mapping )
         p2m_remove_pte(entry, p2m->clean_pte);
 
     if ( removing_mapping )

++++++ 61efec4d-gnttab-only-decrement-refcounter-on-final-unmap.patch ++++++
# Commit 975a8fb45ca186b3476e5656c6ad5dad1122dbfd
# Date 2022-01-25 13:25:49 +0100
# Author Julien Grall <[email protected]>
# Committer Jan Beulich <[email protected]>
xen/grant-table: Only decrement the refcounter when grant is fully unmapped

The grant unmapping hypercall (GNTTABOP_unmap_grant_ref) is not a
simple revert of the changes done by the grant mapping hypercall
(GNTTABOP_map_grant_ref).

Instead, it is possible to partially (or even not) clear some flags.
This will leave the grant mapped until a future call where all
the flags would be cleared.

XSA-380 introduced a refcount that is meant to only be dropped
when the grant is fully unmapped. Unfortunately, unmap_common() will
decrement the refcount for every successful call.

A consequence is that a domain would be able to underflow the refcount
and trigger a BUG().

Looking at the code, it is not clear to me why a domain would
want to partially clear some flags in the grant-table. But as
this is part of the ABI, it is better to not change the behavior
for now.

Fix it by checking whether the maptrack handle has been released before
decrementing the refcount.
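
A toy C model of the underflow (made-up minimal model, not the grant-table
code): one increment per map, but the buggy path decremented on every
partial-unmap call:

    #include <assert.h>

    int main(void)
    {
        int refcount = 0;

        refcount++;        /* map_grant_ref(): single increment per map */

        /* Two GNTTABOP_unmap_grant_ref calls, each clearing only some
         * flags; pre-fix code decremented on every successful call. */
        refcount--;
        refcount--;

        assert(refcount >= 0);   /* fires: models the refcount BUG() */
        return 0;
    }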

This is CVE-2022-23034 / XSA-394.

Fixes: 9781b51efde2 ("gnttab: replace mapkind()")
Signed-off-by: Julien Grall <[email protected]>
Reviewed-by: Jan Beulich <[email protected]>

--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -1488,8 +1488,15 @@ unmap_common(
     if ( put_handle )
         put_maptrack_handle(lgt, op->handle);
 
-    /* See the respective comment in map_grant_ref(). */
-    if ( rc == GNTST_okay && ld != rd && gnttab_need_iommu_mapping(ld) )
+    /*
+     * map_grant_ref() will only increment the refcount (and update the
+     * IOMMU) once per mapping. So we only want to decrement it once the
+     * maptrack handle has been put, alongside the further IOMMU update.
+     *
+     * For the second and third check, see the respective comment in
+     * map_grant_ref().
+     */
+    if ( put_handle && ld != rd && gnttab_need_iommu_mapping(ld) )
     {
         void **slot;
         union maptrack_node node;

++++++ 61efec96-IOMMU-x86-stop-pirq-iteration-immediately-on-error.patch ++++++
# Commit 9480a1a519cf016623f657dc544cb372a82b5708
# Date 2022-01-25 13:27:02 +0100
# Author Julien Grall <[email protected]>
# Committer Jan Beulich <[email protected]>
passthrough/x86: stop pirq iteration immediately in case of error

pt_pirq_iterate() will iterate in batch over all the PIRQs. The outer
loop will bail out if 'rc' is non-zero but the inner loop will continue.

This means 'rc' will get clobbered and we may miss any errors (such as
-ERESTART in the case of the callback pci_clean_dpci_irq()).
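
A minimal C model of the clobbering (hypothetical names): the inner loop
keeps calling the callback, so a later success overwrites an earlier
-ERESTART:

    #include <stdio.h>

    static int cb(int pirq)
    {
        return pirq == 0 ? -85 /* -ERESTART */ : 0;
    }

    int main(void)
    {
        int rc = 0;

        for ( int i = 0; i < 8; i++ )
        {
            rc = cb(i);
            /* The fix: "if ( rc ) break;" so the error propagates. */
        }

        printf("rc = %d\n", rc);   /* prints 0 - the -ERESTART was lost */
        return 0;
    }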

This is CVE-2022-23035 / XSA-395.

Fixes: c24536b636f2 ("replace d->nr_pirqs sized arrays with radix tree")
Fixes: f6dd295381f4 ("dpci: replace tasklet with softirq")
Signed-off-by: Julien Grall <[email protected]>
Signed-off-by: Jan Beulich <[email protected]>
Reviewed-by: Roger Pau Monné <[email protected]>

--- a/xen/drivers/passthrough/x86/hvm.c
+++ b/xen/drivers/passthrough/x86/hvm.c
@@ -732,7 +732,11 @@ int pt_pirq_iterate(struct domain *d,
 
             pirq = pirqs[i]->pirq;
             if ( (pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
+            {
                 rc = cb(d, pirq_dpci, arg);
+                if ( rc )
+                    break;
+            }
         }
     } while ( !rc && ++pirq < d->nr_pirqs && n == ARRAY_SIZE(pirqs) );
 

++++++ 61f2d886-x86-CPUID-disentangle-new-leaves-logic.patch ++++++
# Commit e3662437eb43cc8002bd39be077ef68b131649c5
# Date 2022-01-27 17:38:15 +0000
# Author Andrew Cooper <[email protected]>
# Committer Andrew Cooper <[email protected]>
x86/cpuid: Disentangle logic for new feature leaves

Adding a new feature leaf is a reasonable amount of boilerplate and for the
patch to build, at least one feature from the new leaf needs defining.  This
typically causes two non-trivial changes to be merged together.

First, have gen-cpuid.py write out some extra placeholder defines:

  #define CPUID_BITFIELD_11 bool :1, :1, lfence_dispatch:1, ...
  #define CPUID_BITFIELD_12 uint32_t :32 /* placeholder */
  #define CPUID_BITFIELD_13 uint32_t :32 /* placeholder */
  #define CPUID_BITFIELD_14 uint32_t :32 /* placeholder */
  #define CPUID_BITFIELD_15 uint32_t :32 /* placeholder */

This allows DECL_BITFIELD() to be added to struct cpuid_policy without
requiring a XEN_CPUFEATURE() declared for the leaf.  The choice of 4 is
arbitrary, and allows us to add more than one leaf at a time if necessary.

Second, rework generic_identify() to not use specific feature names.

The choice of deriving the index from a feature was to avoid mismatches, but
its correctness depends on bugs like c/s 249e0f1d8f20 ("x86/cpuid: Fix
TSXLDTRK definition") not happening.

Switch to using FEATURESET_* just like the policy/featureset helpers.  This
breaks the cognitive complexity of needing to know which leaf a specifically
named feature should reside in, and is shorter to write.  It is also far
easier to identify as correct at a glance, given the correlation with the
CPUID leaf being read.

In addition, tidy up some other bits of generic_identify()
 * Drop leading zeros from leaf numbers.
 * Don't use a locked update for X86_FEATURE_APERFMPERF.
 * Rework extended_cpuid_level calculation to avoid setting it twice.
 * Use "leaf >= $N" consistently so $N matches with the CPUID input.

Signed-off-by: Andrew Cooper <[email protected]>
Reviewed-by: Jan Beulich <[email protected]>

--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -379,7 +379,7 @@ static void generic_identify(struct cpui
        u32 eax, ebx, ecx, edx, tmp;
 
        /* Get vendor name */
-       cpuid(0x00000000, &c->cpuid_level, &ebx, &ecx, &edx);
+       cpuid(0, &c->cpuid_level, &ebx, &ecx, &edx);
        *(u32 *)&c->x86_vendor_id[0] = ebx;
        *(u32 *)&c->x86_vendor_id[8] = ecx;
        *(u32 *)&c->x86_vendor_id[4] = edx;
@@ -394,7 +394,7 @@ static void generic_identify(struct cpui
        /* Note that the vendor-specific code below might override */
 
        /* Model and family information. */
-       cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
+       cpuid(1, &eax, &ebx, &ecx, &edx);
        c->x86 = get_cpu_family(eax, &c->x86_model, &c->x86_mask);
        c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
        c->phys_proc_id = c->apicid;
@@ -404,53 +404,53 @@ static void generic_identify(struct cpui
 
        /* c_early_init() may have adjusted cpuid levels/features.  Reread. */
        c->cpuid_level = cpuid_eax(0);
-       cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
-       c->x86_capability[cpufeat_word(X86_FEATURE_FPU)] = edx;
-       c->x86_capability[cpufeat_word(X86_FEATURE_SSE3)] = ecx;
+       cpuid(1, &eax, &ebx,
+             &c->x86_capability[FEATURESET_1c],
+             &c->x86_capability[FEATURESET_1d]);
 
        if ( cpu_has(c, X86_FEATURE_CLFLUSH) )
                c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
 
        if ( (c->cpuid_level >= CPUID_PM_LEAF) &&
             (cpuid_ecx(CPUID_PM_LEAF) & CPUID6_ECX_APERFMPERF_CAPABILITY) )
-               set_bit(X86_FEATURE_APERFMPERF, c->x86_capability);
+               __set_bit(X86_FEATURE_APERFMPERF, c->x86_capability);
+
+       eax = cpuid_eax(0x80000000);
+       if ((eax >> 16) == 0x8000)
+               c->extended_cpuid_level = eax;
 
        /* AMD-defined flags: level 0x80000001 */
-       c->extended_cpuid_level = cpuid_eax(0x80000000);
-       if ((c->extended_cpuid_level >> 16) != 0x8000)
-               c->extended_cpuid_level = 0;
-       if (c->extended_cpuid_level > 0x80000000)
+       if (c->extended_cpuid_level >= 0x80000001)
                cpuid(0x80000001, &tmp, &tmp,
-                     &c->x86_capability[cpufeat_word(X86_FEATURE_LAHF_LM)],
-                     &c->x86_capability[cpufeat_word(X86_FEATURE_SYSCALL)]);
+                     &c->x86_capability[FEATURESET_e1c],
+                     &c->x86_capability[FEATURESET_e1d]);
 
        if (c->extended_cpuid_level >= 0x80000004)
                get_model_name(c); /* Default name */
        if (c->extended_cpuid_level >= 0x80000007)
-               c->x86_capability[cpufeat_word(X86_FEATURE_ITSC)]
-                       = cpuid_edx(0x80000007);
+               c->x86_capability[FEATURESET_e7d] = cpuid_edx(0x80000007);
        if (c->extended_cpuid_level >= 0x80000008)
-               c->x86_capability[cpufeat_word(X86_FEATURE_CLZERO)]
-                       = cpuid_ebx(0x80000008);
+               c->x86_capability[FEATURESET_e8b] = cpuid_ebx(0x80000008);
        if (c->extended_cpuid_level >= 0x80000021)
-               c->x86_capability[cpufeat_word(X86_FEATURE_LFENCE_DISPATCH)]
-                       = cpuid_eax(0x80000021);
+               c->x86_capability[FEATURESET_e21a] = cpuid_eax(0x80000021);
 
        /* Intel-defined flags: level 0x00000007 */
-       if ( c->cpuid_level >= 0x00000007 ) {
-               cpuid_count(0x00000007, 0, &eax,
-                           &c->x86_capability[cpufeat_word(X86_FEATURE_FSGSBASE)],
-                           &c->x86_capability[cpufeat_word(X86_FEATURE_PKU)],
-                           &c->x86_capability[cpufeat_word(X86_FEATURE_AVX512_4VNNIW)]);
-               if (eax > 0)
-                       cpuid_count(0x00000007, 1,
-                                   &c->x86_capability[cpufeat_word(X86_FEATURE_AVX512_BF16)],
+       if (c->cpuid_level >= 7) {
+               uint32_t max_subleaf;
+
+               cpuid_count(7, 0, &max_subleaf,
+                           &c->x86_capability[FEATURESET_7b0],
+                           &c->x86_capability[FEATURESET_7c0],
+                           &c->x86_capability[FEATURESET_7d0]);
+               if (max_subleaf >= 1)
+                       cpuid_count(7, 1,
+                                   &c->x86_capability[FEATURESET_7a1],
                                    &tmp, &tmp, &tmp);
        }
 
        if (c->cpuid_level >= 0xd)
                cpuid_count(0xd, 1,
-                           &c->x86_capability[cpufeat_word(X86_FEATURE_XSAVEOPT)],
+                           &c->x86_capability[FEATURESET_Da1],
                            &tmp, &tmp, &tmp);
 }
 
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -423,6 +423,8 @@ def write_results(state):
 
 """)
 
+    state.bitfields += ["uint32_t :32 /* placeholder */"] * 4
+
     for idx, text in enumerate(state.bitfields):
         state.output.write(
             "#define CPUID_BITFIELD_%d \\\n    %s\n\n"

++++++ 61f2d887-x86-CPUID-leaf-7-1-EBX-infra.patch ++++++
# Commit e1828e3032ebfe036023cd733adfd2d4ec856688
# Date 2022-01-27 17:38:15 +0000
# Author Jan Beulich <[email protected]>
# Committer Andrew Cooper <[email protected]>
x86/cpuid: Infrastructure for leaf 7:1.ebx

Signed-off-by: Jan Beulich <[email protected]>
Signed-off-by: Andrew Cooper <[email protected]>

--- a/tools/misc/xen-cpuid.c
+++ b/tools/misc/xen-cpuid.c
@@ -193,6 +193,10 @@ static const char *const str_e21a[32] =
     [ 6] = "nscb",
 };
 
+static const char *const str_7b1[32] =
+{
+};
+
 static const struct {
     const char *name;
     const char *abbr;
@@ -211,6 +215,7 @@ static const struct {
     { "0x00000007:0.edx", "7d0", str_7d0 },
     { "0x00000007:1.eax", "7a1", str_7a1 },
     { "0x80000021.eax",  "e21a", str_e21a },
+    { "0x00000007:1.ebx", "7b1", str_7b1 },
 };
 
 #define COL_ALIGN "18"
--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -445,7 +445,8 @@ static void generic_identify(struct cpui
                if (max_subleaf >= 1)
                        cpuid_count(7, 1,
                                    &c->x86_capability[FEATURESET_7a1],
-                                   &tmp, &tmp, &tmp);
+                                   &c->x86_capability[FEATURESET_7b1],
+                                   &tmp, &tmp);
        }
 
        if (c->cpuid_level >= 0xd)
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -297,6 +297,8 @@ XEN_CPUFEATURE(FSRCS,        10*32+12) /
 XEN_CPUFEATURE(LFENCE_DISPATCH,    11*32+ 2) /*A  LFENCE always serializing */
 XEN_CPUFEATURE(NSCB,               11*32+ 6) /*A  Null Selector Clears Base (and limit too) */
 
+/* Intel-defined CPU features, CPUID level 0x00000007:1.ebx, word 12 */
+
 #endif /* XEN_CPUFEATURE */
 
 /* Clean up from a default include.  Close the enum (for C). */
--- a/xen/include/xen/lib/x86/cpuid.h
+++ b/xen/include/xen/lib/x86/cpuid.h
@@ -16,6 +16,7 @@
 #define FEATURESET_7d0    9 /* 0x00000007:0.edx    */
 #define FEATURESET_7a1   10 /* 0x00000007:1.eax    */
 #define FEATURESET_e21a  11 /* 0x80000021.eax      */
+#define FEATURESET_7b1   12 /* 0x00000007:1.ebx    */
 
 struct cpuid_leaf
 {
@@ -188,6 +189,10 @@ struct cpuid_policy
                 uint32_t _7a1;
                 struct { DECL_BITFIELD(7a1); };
             };
+            union {
+                uint32_t _7b1;
+                struct { DECL_BITFIELD(7b1); };
+            };
         };
     } feat;
 
@@ -327,6 +332,7 @@ static inline void cpuid_policy_to_featu
     fs[FEATURESET_7d0] = p->feat._7d0;
     fs[FEATURESET_7a1] = p->feat._7a1;
     fs[FEATURESET_e21a] = p->extd.e21a;
+    fs[FEATURESET_7b1] = p->feat._7b1;
 }
 
 /* Fill in a CPUID policy from a featureset bitmap. */
@@ -345,6 +351,7 @@ static inline void cpuid_featureset_to_p
     p->feat._7d0  = fs[FEATURESET_7d0];
     p->feat._7a1  = fs[FEATURESET_7a1];
     p->extd.e21a  = fs[FEATURESET_e21a];
+    p->feat._7b1  = fs[FEATURESET_7b1];
 }
 
 static inline uint64_t cpuid_policy_xcr0_max(const struct cpuid_policy *p)
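
As a reading aid, the new word takes part in the policy <-> featureset
round-trip shown in the two helpers above.  A minimal sketch (array bound and
helper names as in xen/lib/x86/cpuid.h):

    uint32_t fs[FEATURESET_NR_ENTRIES] = {};

    cpuid_policy_to_featureset(p, fs);  /* fs[FEATURESET_7b1] = p->feat._7b1 */
    fs[FEATURESET_7b1] |= 1u << 0;      /* flip a hypothetical bit */
    cpuid_featureset_to_policy(fs, p);  /* p->feat._7b1 = fs[FEATURESET_7b1] */

Both directions must name every word, which is why the patch has to touch both
helpers as well as the DECL_BITFIELD() union.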

++++++ 61f2dd76-x86-SPEC_CTRL-migration-compatibility.patch ++++++
# Commit 969a57f73f6b011b2ebf4c0ab1715efc65837335
# Date 2022-01-27 17:59:18 +0000
# Author Andrew Cooper <[email protected]>
# Committer Andrew Cooper <[email protected]>
x86/msr: Fix migration compatibility issue with MSR_SPEC_CTRL

This bug existed early in 2018, between MSR_SPEC_CTRL arriving in microcode,
and SSBD arriving a few months later.  It went unnoticed presumably because
everyone was busy rebooting everything.

The same bug will reappear when adding PSFD support.

Clamp the guest MSR_SPEC_CTRL value to that permitted by CPUID on migrate.
The guest is already playing with reserved bits at this point, and clamping
the value will prevent a migration to a less capable host from failing.

Signed-off-by: Andrew Cooper <[email protected]>
Reviewed-by: Jan Beulich <[email protected]>

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1373,6 +1373,7 @@ static const uint32_t msrs_to_send[] = {
 
 static int hvm_save_cpu_msrs(struct vcpu *v, hvm_domain_context_t *h)
 {
+    const struct domain *d = v->domain;
     struct hvm_save_descriptor *desc = _p(&h->data[h->cur]);
     struct hvm_msr *ctxt;
     unsigned int i;
@@ -1388,7 +1389,8 @@ static int hvm_save_cpu_msrs(struct vcpu
     for ( i = 0; i < ARRAY_SIZE(msrs_to_send); ++i )
     {
         uint64_t val;
-        int rc = guest_rdmsr(v, msrs_to_send[i], &val);
+        unsigned int msr = msrs_to_send[i];
+        int rc = guest_rdmsr(v, msr, &val);
 
         /*
          * It is the programmers responsibility to ensure that
@@ -1408,7 +1410,26 @@ static int hvm_save_cpu_msrs(struct vcpu
         if ( !val )
             continue; /* Skip empty MSRs. */
 
-        ctxt->msr[ctxt->count].index = msrs_to_send[i];
+        /*
+         * Guests are given full access to certain MSRs for performance
+         * reasons.  A consequence is that Xen is unable to enforce that all
+         * bits disallowed by the CPUID policy yield #GP, and an enterprising
+         * guest may be able to set and use a bit it ought to leave alone.
+         *
+         * When migrating from a more capable host to a less capable one, such
+         * bits may be rejected by the destination, and the migration failed.
+         *
+         * Discard such bits here on the source side.  Such bits have reserved
+         * behaviour, and the guest has only itself to blame.
+         */
+        switch ( msr )
+        {
+        case MSR_SPEC_CTRL:
+            val &= msr_spec_ctrl_valid_bits(d->arch.cpuid);
+            break;
+        }
+
+        ctxt->msr[ctxt->count].index = msr;
         ctxt->msr[ctxt->count++].val = val;
     }
 
--- a/xen/include/asm-x86/msr.h
+++ b/xen/include/asm-x86/msr.h
@@ -277,6 +277,8 @@ static inline void wrmsr_tsc_aux(uint32_
     }
 }
 
+uint64_t msr_spec_ctrl_valid_bits(const struct cpuid_policy *cp);
+
 extern struct msr_policy     raw_msr_policy,
                             host_msr_policy,
                           pv_max_msr_policy,
--- a/xen/arch/x86/msr.c
+++ b/xen/arch/x86/msr.c
@@ -435,6 +435,24 @@ int guest_rdmsr(struct vcpu *v, uint32_t
     return X86EMUL_EXCEPTION;
 }
 
+/*
+ * Caller to confirm that MSR_SPEC_CTRL is available.  Intel and AMD have
+ * separate CPUID features for this functionality, but only one will be
+ * active.
+ */
+uint64_t msr_spec_ctrl_valid_bits(const struct cpuid_policy *cp)
+{
+    bool ssbd = cp->feat.ssbd;
+
+    /*
+     * Note: SPEC_CTRL_STIBP is specified as safe to use (i.e. ignored)
+     * when STIBP isn't enumerated in hardware.
+     */
+    return (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP |
+            (ssbd       ? SPEC_CTRL_SSBD       : 0) |
+            0);
+}
+
 int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
 {
     const struct vcpu *curr = current;
@@ -508,18 +526,9 @@ int guest_wrmsr(struct vcpu *v, uint32_t
         break;
 
     case MSR_SPEC_CTRL:
-        if ( !cp->feat.ibrsb )
-            goto gp_fault; /* MSR available? */
-
-        /*
-         * Note: SPEC_CTRL_STIBP is specified as safe to use (i.e. ignored)
-         * when STIBP isn't enumerated in hardware.
-         */
-        rsvd = ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP |
-                 (cp->feat.ssbd ? SPEC_CTRL_SSBD : 0));
-
-        if ( val & rsvd )
-            goto gp_fault; /* Rsvd bit set? */
+        if ( !cp->feat.ibrsb ||
+             (val & ~msr_spec_ctrl_valid_bits(cp)) )
+            goto gp_fault;
         goto set_reg;
 
     case MSR_PRED_CMD:
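
A worked example of the clamp (bit values hypothetical): suppose a guest set
SSBD via its unintercepted MSR access, but the domain's CPUID policy never
advertised it.

    uint64_t val = SPEC_CTRL_IBRS | SPEC_CTRL_SSBD; /* guest-written value */

    /* cp->feat.ssbd == 0, so the valid mask is IBRS | STIBP only ... */
    val &= msr_spec_ctrl_valid_bits(cp);            /* -> SPEC_CTRL_IBRS */

The stray SSBD bit is dropped on the source side, so the destination's
guest_wrmsr() bounds check no longer rejects the incoming MSR record.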

++++++ 61f7b2af-libxl-dont-touch-nr_vcpus_out-if-listing.patch ++++++
References: bsc#1191668, bsc#1194267

# Commit d9d3496e817ace919092d70d4730257b37c2e743
# Date 2022-01-31 10:58:07 +0100
# Author Dario Faggioli <[email protected]>
# Committer Jan Beulich <[email protected]>
tools/libs/light: don't touch nr_vcpus_out if listing vcpus and returning NULL

If we are in libxl_list_vcpu() and we are returning NULL, let's avoid
touching the output parameter *nr_vcpus_out, which the caller should
have initialized to 0.

The current behavior could be problematic if we are creating a domain and,
in the meantime, an existing one is destroyed when we have already done
some steps of the loop. At which point, we'd return a NULL list of vcpus
but with something different than 0 as the number of vcpus in that list.
And this can cause troubles in the callers (e.g., nr_vcpus_on_nodes()),
when they do a libxl_vcpuinfo_list_free().

Crashes due to this are rare and difficult to reproduce, but have been
observed, with stack traces looking like this one:

#0  libxl_bitmap_dispose (map=map@entry=0x50) at libxl_utils.c:626
#1  0x00007fe72c993a32 in libxl_vcpuinfo_dispose (p=p@entry=0x38) at _libxl_types.c:692
#2  0x00007fe72c94e3c4 in libxl_vcpuinfo_list_free (list=0x0, nr=<optimized out>) at libxl_utils.c:1059
#3  0x00007fe72c9528bf in nr_vcpus_on_nodes (vcpus_on_node=0x7fe71000eb60, suitable_cpumap=0x7fe721df0d38, tinfo_elements=48, tinfo=0x7fe7101b3900, gc=0x7fe7101bbfa0) at libxl_numa.c:258
#4  libxl__get_numa_candidate (gc=gc@entry=0x7fe7100033a0, min_free_memkb=4233216, min_cpus=4, min_nodes=min_nodes@entry=0, max_nodes=max_nodes@entry=0, suitable_cpumap=suitable_cpumap@entry=0x7fe721df0d38, numa_cmpf=0x7fe72c940110 <numa_cmpf>, cndt_out=0x7fe721df0cf0, cndt_found=0x7fe721df0cb4) at libxl_numa.c:394
#5  0x00007fe72c94152b in numa_place_domain (d_config=0x7fe721df11b0, domid=975, gc=0x7fe7100033a0) at libxl_dom.c:209
#6  libxl__build_pre (gc=gc@entry=0x7fe7100033a0, domid=domid@entry=975, d_config=d_config@entry=0x7fe721df11b0, state=state@entry=0x7fe710077700) at libxl_dom.c:436
#7  0x00007fe72c92c4a5 in libxl__domain_build (gc=0x7fe7100033a0, d_config=d_config@entry=0x7fe721df11b0, domid=975, state=0x7fe710077700) at libxl_create.c:444
#8  0x00007fe72c92de8b in domcreate_bootloader_done (egc=0x7fe721df0f60, bl=0x7fe7100778c0, rc=<optimized out>) at libxl_create.c:1222
#9  0x00007fe72c980425 in libxl__bootloader_run (egc=egc@entry=0x7fe721df0f60, bl=bl@entry=0x7fe7100778c0) at libxl_bootloader.c:403
#10 0x00007fe72c92f281 in initiate_domain_create (egc=egc@entry=0x7fe721df0f60, dcs=dcs@entry=0x7fe7100771b0) at libxl_create.c:1159
#11 0x00007fe72c92f456 in do_domain_create (ctx=ctx@entry=0x7fe71001c840, d_config=d_config@entry=0x7fe721df11b0, domid=domid@entry=0x7fe721df10a8, restore_fd=restore_fd@entry=-1, send_back_fd=send_back_fd@entry=-1, params=params@entry=0x0, ao_how=0x0, aop_console_how=0x7fe721df10f0) at libxl_create.c:1856
#12 0x00007fe72c92f776 in libxl_domain_create_new (ctx=0x7fe71001c840, d_config=d_config@entry=0x7fe721df11b0, domid=domid@entry=0x7fe721df10a8, ao_how=ao_how@entry=0x0, aop_console_how=aop_console_how@entry=0x7fe721df10f0) at libxl_create.c:2075

Signed-off-by: Dario Faggioli <[email protected]>
Tested-by: James Fehlig <[email protected]>
Reviewed-by: Anthony PERARD <[email protected]>

--- a/tools/libs/light/libxl_domain.c
+++ b/tools/libs/light/libxl_domain.c
@@ -1661,6 +1661,7 @@ libxl_vcpuinfo *libxl_list_vcpu(libxl_ct
     libxl_vcpuinfo *ptr, *ret;
     xc_domaininfo_t domaininfo;
     xc_vcpuinfo_t vcpuinfo;
+    unsigned int nr_vcpus;
 
     if (xc_domain_getinfolist(ctx->xch, domid, 1, &domaininfo) != 1) {
         LOGED(ERROR, domid, "Getting infolist");
@@ -1677,33 +1678,34 @@ libxl_vcpuinfo *libxl_list_vcpu(libxl_ct
     ret = ptr = libxl__calloc(NOGC, domaininfo.max_vcpu_id + 1,
                               sizeof(libxl_vcpuinfo));
 
-    for (*nr_vcpus_out = 0;
-         *nr_vcpus_out <= domaininfo.max_vcpu_id;
-         ++*nr_vcpus_out, ++ptr) {
+    for (nr_vcpus = 0;
+         nr_vcpus <= domaininfo.max_vcpu_id;
+         ++nr_vcpus, ++ptr) {
         libxl_bitmap_init(&ptr->cpumap);
         if (libxl_cpu_bitmap_alloc(ctx, &ptr->cpumap, 0))
             goto err;
         libxl_bitmap_init(&ptr->cpumap_soft);
         if (libxl_cpu_bitmap_alloc(ctx, &ptr->cpumap_soft, 0))
             goto err;
-        if (xc_vcpu_getinfo(ctx->xch, domid, *nr_vcpus_out, &vcpuinfo) == -1) {
+        if (xc_vcpu_getinfo(ctx->xch, domid, nr_vcpus, &vcpuinfo) == -1) {
             LOGED(ERROR, domid, "Getting vcpu info");
             goto err;
         }
 
-        if (xc_vcpu_getaffinity(ctx->xch, domid, *nr_vcpus_out,
+        if (xc_vcpu_getaffinity(ctx->xch, domid, nr_vcpus,
                                 ptr->cpumap.map, ptr->cpumap_soft.map,
                                XEN_VCPUAFFINITY_SOFT|XEN_VCPUAFFINITY_HARD) == -1) {
             LOGED(ERROR, domid, "Getting vcpu affinity");
             goto err;
         }
-        ptr->vcpuid = *nr_vcpus_out;
+        ptr->vcpuid = nr_vcpus;
         ptr->cpu = vcpuinfo.cpu;
         ptr->online = !!vcpuinfo.online;
         ptr->blocked = !!vcpuinfo.blocked;
         ptr->running = !!vcpuinfo.running;
         ptr->vcpu_time = vcpuinfo.cpu_time;
     }
+    *nr_vcpus_out = nr_vcpus;
     GC_FREE;
     return ret;
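
For context, a sketch of the caller pattern the fix protects (signature as in
libxl.h; error handling elided):

    int nb_vcpu = 0, nr_cpus = 0;
    libxl_vcpuinfo *vinfo = libxl_list_vcpu(ctx, domid, &nb_vcpu, &nr_cpus);

    /* ... use vinfo ... */

    /* Safe either way now: on failure vinfo == NULL and nb_vcpu is still 0,
     * so the dispose loop inside the free helper runs zero times. */
    libxl_vcpuinfo_list_free(vinfo, nb_vcpu);

Before the fix, a mid-loop failure returned NULL having already incremented
*nr_vcpus_out, and libxl_vcpuinfo_list_free() then walked a NULL list.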
 

++++++ 61f933a4-x86-cpuid-advertise-SSB_NO.patch ++++++
# Commit 15b7611efd497c4b65f350483857082cb70fc348
# Date 2022-02-01 13:20:44 +0000
# Author Andrew Cooper <[email protected]>
# Committer Andrew Cooper <[email protected]>
x86/cpuid: Advertise SSB_NO to guests by default

This is a statement of hardware behaviour, and not related to controls for the
guest kernel to use.  Pass it straight through from hardware.

Signed-off-by: Andrew Cooper <[email protected]>
Reviewed-by: Jan Beulich <[email protected]>

--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -264,7 +264,7 @@ XEN_CPUFEATURE(NO_LMSL,       8*32+20) /
 XEN_CPUFEATURE(AMD_PPIN,      8*32+23) /*   Protected Processor Inventory Number */
 XEN_CPUFEATURE(AMD_SSBD,      8*32+24) /*   MSR_SPEC_CTRL.SSBD available */
 XEN_CPUFEATURE(VIRT_SSBD,     8*32+25) /*   MSR_VIRT_SPEC_CTRL.SSBD */
-XEN_CPUFEATURE(SSB_NO,        8*32+26) /*   Hardware not vulnerable to SSB */
+XEN_CPUFEATURE(SSB_NO,        8*32+26) /*A  Hardware not vulnerable to SSB */
 XEN_CPUFEATURE(PSFD,          8*32+28) /*   MSR_SPEC_CTRL.PSFD */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0.edx, word 9 */

++++++ 61f933a5-x86-drop-use_spec_ctrl-boolean.patch ++++++
# Commit ec083bf552c35e10347449e21809f4780f8155d2
# Date 2022-02-01 13:20:44 +0000
# Author Andrew Cooper <[email protected]>
# Committer Andrew Cooper <[email protected]>
x86/spec-ctrl: Drop use_spec_ctrl boolean

Several bugfixes have reduced the utility of this variable from its original
purpose, and now all it does is aid in the setup of SCF_ist_wrmsr.

Simplify the logic by dropping the variable, and doubling up the setting of
SCF_ist_wrmsr for the PV and HVM blocks, which will make the AMD SPEC_CTRL
support easier to follow.  Leave a comment explaining why SCF_ist_wrmsr is
still necessary for the VMExit case.

No functional change.

Signed-off-by: Andrew Cooper <[email protected]>
Reviewed-by: Jan Beulich <[email protected]>

--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -927,7 +927,7 @@ static __init void mds_calculations(uint
 void __init init_speculation_mitigations(void)
 {
     enum ind_thunk thunk = THUNK_DEFAULT;
-    bool use_spec_ctrl = false, ibrs = false, hw_smt_enabled;
+    bool ibrs = false, hw_smt_enabled;
     bool cpu_has_bug_taa;
     uint64_t caps = 0;
 
@@ -1016,19 +1016,21 @@ void __init init_speculation_mitigations
     {
         if ( opt_msr_sc_pv )
         {
-            use_spec_ctrl = true;
+            default_spec_ctrl_flags |= SCF_ist_wrmsr;
             setup_force_cpu_cap(X86_FEATURE_SC_MSR_PV);
         }
 
         if ( opt_msr_sc_hvm )
         {
-            use_spec_ctrl = true;
+            /*
+             * While the guest MSR_SPEC_CTRL value is loaded/saved atomically,
+             * Xen's value is not restored atomically.  An early NMI hitting
+             * the VMExit path needs to restore Xen's value for safety.
+             */
+            default_spec_ctrl_flags |= SCF_ist_wrmsr;
             setup_force_cpu_cap(X86_FEATURE_SC_MSR_HVM);
         }
 
-        if ( use_spec_ctrl )
-            default_spec_ctrl_flags |= SCF_ist_wrmsr;
-
         if ( ibrs )
             default_xen_spec_ctrl |= SPEC_CTRL_IBRS;
     }

++++++ 61f933a6-x86-new-has_spec_ctrl-boolean.patch ++++++
# Commit 5d9eff3a312763d889cfbf3c8468b6dfb3ab490c
# Date 2022-02-01 13:20:44 +0000
# Author Andrew Cooper <[email protected]>
# Committer Andrew Cooper <[email protected]>
x86/spec-ctrl: Introduce new has_spec_ctrl boolean

Most MSR_SPEC_CTRL setup will be common between Intel and AMD.  Instead of
opencoding an OR of two features everywhere, introduce has_spec_ctrl instead.

Reword the comment above the Intel specific alternatives block to highlight
that it is Intel specific, and pull the setting of default_xen_spec_ctrl.IBRS
out because it will want to be common.

No functional change.

Signed-off-by: Andrew Cooper <[email protected]>
Reviewed-by: Jan Beulich <[email protected]>

--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -927,7 +927,7 @@ static __init void mds_calculations(uint
 void __init init_speculation_mitigations(void)
 {
     enum ind_thunk thunk = THUNK_DEFAULT;
-    bool ibrs = false, hw_smt_enabled;
+    bool has_spec_ctrl, ibrs = false, hw_smt_enabled;
     bool cpu_has_bug_taa;
     uint64_t caps = 0;
 
@@ -936,6 +936,8 @@ void __init init_speculation_mitigations
 
     hw_smt_enabled = check_smt_enabled();
 
+    has_spec_ctrl = boot_cpu_has(X86_FEATURE_IBRSB);
+
     /*
      * First, disable the use of retpolines if Xen is using shadow stacks, as
      * they are incompatible.
@@ -973,11 +975,11 @@ void __init init_speculation_mitigations
              */
             else if ( retpoline_safe(caps) )
                 thunk = THUNK_RETPOLINE;
-            else if ( boot_cpu_has(X86_FEATURE_IBRSB) )
+            else if ( has_spec_ctrl )
                 ibrs = true;
         }
         /* Without compiler thunk support, use IBRS if available. */
-        else if ( boot_cpu_has(X86_FEATURE_IBRSB) )
+        else if ( has_spec_ctrl )
             ibrs = true;
     }
 
@@ -1008,10 +1010,7 @@ void __init init_speculation_mitigations
     else if ( thunk == THUNK_JMP )
         setup_force_cpu_cap(X86_FEATURE_IND_THUNK_JMP);
 
-    /*
-     * If we are on hardware supporting MSR_SPEC_CTRL, see about setting up
-     * the alternatives blocks so we can virtualise support for guests.
-     */
+    /* Intel hardware: MSR_SPEC_CTRL alternatives setup. */
     if ( boot_cpu_has(X86_FEATURE_IBRSB) )
     {
         if ( opt_msr_sc_pv )
@@ -1030,11 +1029,12 @@ void __init init_speculation_mitigations
             default_spec_ctrl_flags |= SCF_ist_wrmsr;
             setup_force_cpu_cap(X86_FEATURE_SC_MSR_HVM);
         }
-
-        if ( ibrs )
-            default_xen_spec_ctrl |= SPEC_CTRL_IBRS;
     }
 
+    /* If we have IBRS available, see whether we should use it. */
+    if ( has_spec_ctrl && ibrs )
+        default_xen_spec_ctrl |= SPEC_CTRL_IBRS;
+
     /* If we have SSBD available, see whether we should use it. */
     if ( boot_cpu_has(X86_FEATURE_SSBD) && opt_ssbd )
         default_xen_spec_ctrl |= SPEC_CTRL_SSBD;
@@ -1268,7 +1268,7 @@ void __init init_speculation_mitigations
      * boot won't have any other code running in a position to mount an
      * attack.
      */
-    if ( boot_cpu_has(X86_FEATURE_IBRSB) )
+    if ( has_spec_ctrl )
     {
         bsp_delay_spec_ctrl = !cpu_has_hypervisor && default_xen_spec_ctrl;
 

++++++ 61f933a7-x86-dont-use-spec_ctrl-enter-exit-for-S3.patch ++++++
# Commit 71fac402e05ade7b0af2c34f77517449f6f7e2c1
# Date 2022-02-01 13:20:44 +0000
# Author Andrew Cooper <[email protected]>
# Committer Andrew Cooper <[email protected]>
x86/spec-ctrl: Don't use spec_ctrl_{enter,exit}_idle() for S3

'idle' here refers to hlt/mwait.  The S3 path isn't an idle path - it is a
platform reset.

We need to load default_xen_spec_ctrl unilaterally on the way back up.
Currently it happens as a side effect of X86_FEATURE_SC_MSR_IDLE or the next
return-to-guest, but that's fragile behaviour.

Conversely, there is no need to clear IBRS and flush the store buffers on the
way down; we're microseconds away from cutting power.

Signed-off-by: Andrew Cooper <[email protected]>
Reviewed-by: Jan Beulich <[email protected]>

--- a/xen/arch/x86/acpi/power.c
+++ b/xen/arch/x86/acpi/power.c
@@ -248,7 +248,6 @@ static int enter_state(u32 state)
         error = 0;
 
     ci = get_cpu_info();
-    spec_ctrl_enter_idle(ci);
     /* Avoid NMI/#MC using MSR_SPEC_CTRL until we've reloaded microcode. */
     ci->spec_ctrl_flags &= ~SCF_ist_wrmsr;
 
@@ -295,7 +294,9 @@ static int enter_state(u32 state)
 
     /* Re-enabled default NMI/#MC use of MSR_SPEC_CTRL. */
     ci->spec_ctrl_flags |= (default_spec_ctrl_flags & SCF_ist_wrmsr);
-    spec_ctrl_exit_idle(ci);
+
+    if ( boot_cpu_has(X86_FEATURE_IBRSB) )
+        wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl);
 
     if ( boot_cpu_has(X86_FEATURE_SRBDS_CTRL) )
         wrmsrl(MSR_MCU_OPT_CTRL, default_xen_mcu_opt_ctrl);

++++++ 61f933a8-x86-SPEC_CTRL-record-last-write.patch ++++++
# Commit 00f2992b6c7a9d4090443c1a85bf83224a87eeb9
# Date 2022-02-01 13:20:44 +0000
# Author Andrew Cooper <[email protected]>
# Committer Andrew Cooper <[email protected]>
x86/spec-ctrl: Record the last write to MSR_SPEC_CTRL

In some cases, writes to MSR_SPEC_CTRL do not have interesting side effects,
and we should implement lazy context switching like we do with other MSRs.

In the short term, this will be used by the SVM infrastructure, but I expect
to extend it to other contexts in due course.

Introduce cpu_info.last_spec_ctrl for the purpose, and cache writes made from
the boot/resume paths.  The value can't live in regular per-cpu data when it
is eventually used for PV guests, where XPTI might be active.

Signed-off-by: Andrew Cooper <[email protected]>
Reviewed-by: Jan Beulich <[email protected]>

--- a/xen/arch/x86/acpi/power.c
+++ b/xen/arch/x86/acpi/power.c
@@ -296,7 +296,10 @@ static int enter_state(u32 state)
     ci->spec_ctrl_flags |= (default_spec_ctrl_flags & SCF_ist_wrmsr);
 
     if ( boot_cpu_has(X86_FEATURE_IBRSB) )
+    {
         wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl);
+        ci->last_spec_ctrl = default_xen_spec_ctrl;
+    }
 
     if ( boot_cpu_has(X86_FEATURE_SRBDS_CTRL) )
         wrmsrl(MSR_MCU_OPT_CTRL, default_xen_mcu_opt_ctrl);
--- a/xen/include/asm-x86/current.h
+++ b/xen/include/asm-x86/current.h
@@ -56,6 +56,7 @@ struct cpu_info {
     /* See asm-x86/spec_ctrl_asm.h for usage. */
     unsigned int shadow_spec_ctrl;
     uint8_t      xen_spec_ctrl;
+    uint8_t      last_spec_ctrl;
     uint8_t      spec_ctrl_flags;
 
     /*
@@ -73,7 +74,6 @@ struct cpu_info {
      */
     bool         use_pv_cr3;
 
-    unsigned long __pad;
     /* get_stack_bottom() must be 16-byte aligned */
 };
 
--- a/xen/include/asm-x86/spec_ctrl_asm.h
+++ b/xen/include/asm-x86/spec_ctrl_asm.h
@@ -67,6 +67,10 @@
  * steps 2 and 6 will restore the shadow value rather than leaving Xen's value
  * loaded and corrupting the value used in guest context.
  *
+ * Additionally, in some cases it is safe to skip writes to MSR_SPEC_CTRL when
+ * we don't require any of the side effects of an identical write.  Maintain a
+ * per-cpu last_spec_ctrl value for this purpose.
+ *
  * The following ASM fragments implement this algorithm.  See their local
  * comments for further details.
  *  - SPEC_CTRL_ENTRY_FROM_PV
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -1944,9 +1944,12 @@ void __init noreturn __start_xen(unsigne
 
     if ( bsp_delay_spec_ctrl )
     {
-        get_cpu_info()->spec_ctrl_flags &= ~SCF_use_shadow;
+        struct cpu_info *info = get_cpu_info();
+
+        info->spec_ctrl_flags &= ~SCF_use_shadow;
         barrier();
         wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl);
+        info->last_spec_ctrl = default_xen_spec_ctrl;
     }
 
     /* Jump to the 1:1 virtual mappings of cpu0_stack. */
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -322,6 +322,8 @@ static void set_cpu_sibling_map(unsigned
 
 void start_secondary(void *unused)
 {
+    struct cpu_info *info = get_cpu_info();
+
     /*
      * Dont put anything before smp_callin(), SMP booting is so fragile that we
      * want to limit the things done here to the most necessary things.
@@ -378,7 +380,10 @@ void start_secondary(void *unused)
      * microcode.
      */
     if ( boot_cpu_has(X86_FEATURE_IBRSB) )
+    {
         wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl);
+        info->last_spec_ctrl = default_xen_spec_ctrl;
+    }
     if ( boot_cpu_has(X86_FEATURE_SRBDS_CTRL) )
         wrmsrl(MSR_MCU_OPT_CTRL, default_xen_mcu_opt_ctrl);
 
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -1270,6 +1270,9 @@ void __init init_speculation_mitigations
      */
     if ( has_spec_ctrl )
     {
+        struct cpu_info *info = get_cpu_info();
+        unsigned int val;
+
         bsp_delay_spec_ctrl = !cpu_has_hypervisor && default_xen_spec_ctrl;
 
         /*
@@ -1278,15 +1281,16 @@ void __init init_speculation_mitigations
          */
         if ( bsp_delay_spec_ctrl )
         {
-            struct cpu_info *info = get_cpu_info();
-
             info->shadow_spec_ctrl = 0;
             barrier();
             info->spec_ctrl_flags |= SCF_use_shadow;
             barrier();
         }
 
-        wrmsrl(MSR_SPEC_CTRL, bsp_delay_spec_ctrl ? 0 : default_xen_spec_ctrl);
+        val = bsp_delay_spec_ctrl ? 0 : default_xen_spec_ctrl;
+
+        wrmsrl(MSR_SPEC_CTRL, val);
+        info->last_spec_ctrl = val;
     }
 
     if ( boot_cpu_has(X86_FEATURE_SRBDS_CTRL) )
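
The lazy-write pattern this enables looks roughly like the following sketch
(the SVM patches later in this series implement the same logic in assembly):

    /* Skip the WRMSR when the wanted value is already loaded. */
    if ( val != info->last_spec_ctrl )
    {
        wrmsrl(MSR_SPEC_CTRL, val);
        info->last_spec_ctrl = val;
    }

The corollary is that every unconditional WRMSR must refresh last_spec_ctrl
too, which is why each wrmsrl(MSR_SPEC_CTRL, ...) site above gains a paired
assignment.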

++++++ 61f933a9-x86-SPEC_CTRL-use-common-logic-for-AMD.patch ++++++
# Commit 378f2e6df31442396f0afda19794c5c6091d96f9
# Date 2022-02-01 13:20:44 +0000
# Author Andrew Cooper <[email protected]>
# Committer Andrew Cooper <[email protected]>
x86/spec-ctrl: Use common MSR_SPEC_CTRL logic for AMD

Currently, amd_init_ssbd() works by being the only write to MSR_SPEC_CTRL in
the system.  This ceases to be true when using the common logic.

Include AMD MSR_SPEC_CTRL in has_spec_ctrl to activate the common paths, and
introduce an AMD specific block to control alternatives.  Also update the
boot/resume paths to configure default_xen_spec_ctrl.

svm.h needs an adjustment to remove a dependency on include order.

For now, only activate alternatives for HVM - PV will require more work.  No
functional change, as no alternatives are defined for HVM yet.

Signed-off-by: Andrew Cooper <[email protected]>
Reviewed-by: Jan Beulich <[email protected]>

--- a/xen/arch/x86/acpi/power.c
+++ b/xen/arch/x86/acpi/power.c
@@ -295,7 +295,7 @@ static int enter_state(u32 state)
     /* Re-enabled default NMI/#MC use of MSR_SPEC_CTRL. */
     ci->spec_ctrl_flags |= (default_spec_ctrl_flags & SCF_ist_wrmsr);
 
-    if ( boot_cpu_has(X86_FEATURE_IBRSB) )
+    if ( boot_cpu_has(X86_FEATURE_IBRSB) || boot_cpu_has(X86_FEATURE_IBRS) )
     {
         wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl);
         ci->last_spec_ctrl = default_xen_spec_ctrl;
--- a/xen/arch/x86/cpu/amd.c
+++ b/xen/arch/x86/cpu/amd.c
@@ -693,7 +693,7 @@ void amd_init_ssbd(const struct cpuinfo_
                return;
 
        if (cpu_has_amd_ssbd) {
-               wrmsrl(MSR_SPEC_CTRL, opt_ssbd ? SPEC_CTRL_SSBD : 0);
+               /* Handled by common MSR_SPEC_CTRL logic */
                return;
        }
 
--- a/xen/include/asm-x86/hvm/svm/svm.h
+++ b/xen/include/asm-x86/hvm/svm/svm.h
@@ -45,6 +45,9 @@ static inline void svm_invlpga(unsigned
         "a" (linear), "c" (asid));
 }
 
+struct cpu_user_regs;
+struct vcpu;
+
 unsigned long *svm_msrbit(unsigned long *msr_bitmap, uint32_t msr);
 void __update_guest_eip(struct cpu_user_regs *regs, unsigned int inst_len);
 void svm_update_guest_cr(struct vcpu *, unsigned int cr, unsigned int flags);
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -379,7 +379,7 @@ void start_secondary(void *unused)
      * settings.  Note: These MSRs may only become available after loading
      * microcode.
      */
-    if ( boot_cpu_has(X86_FEATURE_IBRSB) )
+    if ( boot_cpu_has(X86_FEATURE_IBRSB) || boot_cpu_has(X86_FEATURE_IBRS) )
     {
         wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl);
         info->last_spec_ctrl = default_xen_spec_ctrl;
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -22,6 +22,7 @@
 #include <xen/param.h>
 #include <xen/warning.h>
 
+#include <asm/hvm/svm/svm.h>
 #include <asm/microcode.h>
 #include <asm/msr.h>
 #include <asm/pv/domain.h>
@@ -936,7 +937,8 @@ void __init init_speculation_mitigations
 
     hw_smt_enabled = check_smt_enabled();
 
-    has_spec_ctrl = boot_cpu_has(X86_FEATURE_IBRSB);
+    has_spec_ctrl = (boot_cpu_has(X86_FEATURE_IBRSB) ||
+                     boot_cpu_has(X86_FEATURE_IBRS));
 
     /*
      * First, disable the use of retpolines if Xen is using shadow stacks, as
@@ -1031,12 +1033,32 @@ void __init init_speculation_mitigations
         }
     }
 
+    /* AMD hardware: MSR_SPEC_CTRL alternatives setup. */
+    if ( boot_cpu_has(X86_FEATURE_IBRS) )
+    {
+        /*
+         * Virtualising MSR_SPEC_CTRL for guests depends on SVM support, which
+         * on real hardware matches the availability of MSR_SPEC_CTRL in the
+         * first place.
+         *
+         * No need for SCF_ist_wrmsr because Xen's value is restored
+         * atomically WRT NMIs in the VMExit path.
+         *
+         * TODO: Adjust cpu_has_svm_spec_ctrl to be usable earlier on boot.
+         */
+        if ( opt_msr_sc_hvm &&
+             (boot_cpu_data.extended_cpuid_level >= 0x8000000a) &&
+             (cpuid_edx(0x8000000a) & (1u << SVM_FEATURE_SPEC_CTRL)) )
+            setup_force_cpu_cap(X86_FEATURE_SC_MSR_HVM);
+    }
+
     /* If we have IBRS available, see whether we should use it. */
     if ( has_spec_ctrl && ibrs )
         default_xen_spec_ctrl |= SPEC_CTRL_IBRS;
 
     /* If we have SSBD available, see whether we should use it. */
-    if ( boot_cpu_has(X86_FEATURE_SSBD) && opt_ssbd )
+    if ( opt_ssbd && (boot_cpu_has(X86_FEATURE_SSBD) ||
+                      boot_cpu_has(X86_FEATURE_AMD_SSBD)) )
         default_xen_spec_ctrl |= SPEC_CTRL_SSBD;
 
     /*

++++++ 61f933aa-SVM-SPEC_CTRL-entry-exit-logic.patch ++++++
# Commit 614cec7d79d76786f5638a6e4da0576b57732ca1
# Date 2022-02-01 13:20:44 +0000
# Author Andrew Cooper <[email protected]>
# Committer Andrew Cooper <[email protected]>
x86/svm: VMEntry/Exit logic for MSR_SPEC_CTRL

Hardware maintains both host and guest versions of MSR_SPEC_CTRL, but guests
run with the logical OR of both values.  Therefore, in principle we want to
clear Xen's value before entering the guest.  However, for migration
compatibility (future work), and for performance reasons with SEV-SNP guests,
we want the ability to use a nonzero value behind the guest's back.  Use
vcpu_msrs to hold this value, with the guest value in the VMCB.

On the VMEntry path, adjusting MSR_SPEC_CTRL must be done after CLGI so as to
be atomic with respect to NMIs/etc.

Signed-off-by: Andrew Cooper <[email protected]>
Reviewed-by: Jan Beulich <[email protected]>

--- a/xen/arch/x86/hvm/svm/entry.S
+++ b/xen/arch/x86/hvm/svm/entry.S
@@ -55,11 +55,23 @@ __UNLIKELY_END(nsvm_hap)
         mov  %rsp, %rdi
         call svm_vmenter_helper
 
-        mov VCPU_arch_msrs(%rbx), %rax
-        mov VCPUMSR_spec_ctrl_raw(%rax), %eax
+        clgi
 
         /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */
-        /* SPEC_CTRL_EXIT_TO_SVM   (nothing currently) */
+        /* SPEC_CTRL_EXIT_TO_SVM       Req: b=curr %rsp=regs/cpuinfo, Clob: acd */
+        .macro svm_vmentry_spec_ctrl
+            mov    VCPU_arch_msrs(%rbx), %rax
+            movzbl CPUINFO_last_spec_ctrl(%rsp), %edx
+            mov    VCPUMSR_spec_ctrl_raw(%rax), %eax
+            cmp    %edx, %eax
+            je     1f /* Skip write if value is correct. */
+            mov    $MSR_SPEC_CTRL, %ecx
+            xor    %edx, %edx
+            wrmsr
+            mov    %al, CPUINFO_last_spec_ctrl(%rsp)
+1:          /* No Spectre v1 concerns.  Execution will hit VMRUN imminently. */
+        .endm
+        ALTERNATIVE "", svm_vmentry_spec_ctrl, X86_FEATURE_SC_MSR_HVM
 
         pop  %r15
         pop  %r14
@@ -78,7 +90,6 @@ __UNLIKELY_END(nsvm_hap)
         pop  %rsi
         pop  %rdi
 
-        clgi
         sti
         vmrun
 
@@ -86,8 +97,21 @@ __UNLIKELY_END(nsvm_hap)
 
         GET_CURRENT(bx)
 
-        /* SPEC_CTRL_ENTRY_FROM_SVM    Req: b=curr %rsp=regs/cpuinfo, Clob: ac */
+        /* SPEC_CTRL_ENTRY_FROM_SVM    Req: %rsp=regs/cpuinfo         Clob: acd */
         ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_HVM
+
+        .macro svm_vmexit_spec_ctrl
+            /*
+             * Write to MSR_SPEC_CTRL unconditionally, for the RAS[:32]
+             * flushing side effect.
+             */
+            mov    $MSR_SPEC_CTRL, %ecx
+            movzbl CPUINFO_xen_spec_ctrl(%rsp), %eax
+            xor    %edx, %edx
+            wrmsr
+            mov    %al, CPUINFO_last_spec_ctrl(%rsp)
+        .endm
+        ALTERNATIVE "", svm_vmexit_spec_ctrl, X86_FEATURE_SC_MSR_HVM
         /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
 
         stgi
--- a/xen/include/asm-x86/msr.h
+++ b/xen/include/asm-x86/msr.h
@@ -297,6 +297,15 @@ struct vcpu_msrs
      *
      * For VT-x guests, the guest value is held in the MSR guest load/save
      * list.
+     *
+     * For SVM, the guest value lives in the VMCB, and hardware saves/restores
+     * the host value automatically.  However, guests run with the OR of the
+     * host and guest value, which allows Xen to set protections behind the
+     * guest's back.
+     *
+     * We must clear/restore Xen's value before/after VMRUN to avoid unduly
+     * influencing the guest.  In order to support "behind the guest's back"
+     * protections, we load this value (commonly 0) before VMRUN.
      */
     struct {
         uint32_t raw;
--- a/xen/include/asm-x86/spec_ctrl_asm.h
+++ b/xen/include/asm-x86/spec_ctrl_asm.h
@@ -46,6 +46,9 @@
  *   - On VMX by using MSR load/save lists to have vmentry/exit atomically
  *     load/save the guest value.  Xen's value is loaded in regular code, and
  *     there is no need to use the shadow logic (below).
+ *   - On SVM by altering MSR_SPEC_CTRL inside the CLGI/STGI region.  This
+ *     makes the changes atomic with respect to NMIs/etc, so no need for
+ *     shadowing logic.
  *
  * Factor 2 is harder.  We maintain a shadow_spec_ctrl value, and a use_shadow
  * boolean in the per cpu spec_ctrl_flags.  The synchronous use is:
--- a/xen/arch/x86/x86_64/asm-offsets.c
+++ b/xen/arch/x86/x86_64/asm-offsets.c
@@ -126,6 +126,7 @@ void __dummy__(void)
     OFFSET(CPUINFO_pv_cr3, struct cpu_info, pv_cr3);
     OFFSET(CPUINFO_shadow_spec_ctrl, struct cpu_info, shadow_spec_ctrl);
     OFFSET(CPUINFO_xen_spec_ctrl, struct cpu_info, xen_spec_ctrl);
+    OFFSET(CPUINFO_last_spec_ctrl, struct cpu_info, last_spec_ctrl);
     OFFSET(CPUINFO_spec_ctrl_flags, struct cpu_info, spec_ctrl_flags);
     OFFSET(CPUINFO_root_pgt_changed, struct cpu_info, root_pgt_changed);
     OFFSET(CPUINFO_use_pv_cr3, struct cpu_info, use_pv_cr3);
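
For readers not fluent in the WRMSR convention the macros rely on: %ecx holds
the MSR index and %edx:%eax the 64-bit value, so "xor %edx, %edx" writes just
the 32-bit value in %eax.  A C rendering of svm_vmexit_spec_ctrl (a sketch,
not the actual implementation, which must stay in asm within the CLGI/STGI
region):

    /* Write unconditionally: the RAS[:32] flush is a wanted side effect
     * even when the value is unchanged. */
    wrmsr(MSR_SPEC_CTRL, info->xen_spec_ctrl, 0);
    info->last_spec_ctrl = info->xen_spec_ctrl;

The entry-side macro differs in comparing against last_spec_ctrl first, as
loading the guest value has no required side effect.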

++++++ 61f933ab-x86-AMD-SPEC_CTRL-infra.patch ++++++
# Commit 22b9add22b4a9af37305c8441fec12cb26bd142b
# Date 2022-02-01 13:20:44 +0000
# Author Andrew Cooper <[email protected]>
# Committer Andrew Cooper <[email protected]>
x86/msr: AMD MSR_SPEC_CTRL infrastructure

Fill in VMCB accessors for spec_ctrl in svm_{get,set}_reg(), and CPUID checks
for all supported bits in guest_{rd,wr}msr().

Signed-off-by: Andrew Cooper <[email protected]>
Reviewed-by: Jan Beulich <[email protected]>

--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -2471,10 +2471,14 @@ static bool svm_get_pending_event(struct
 
 static uint64_t svm_get_reg(struct vcpu *v, unsigned int reg)
 {
+    const struct vmcb_struct *vmcb = v->arch.hvm.svm.vmcb;
     struct domain *d = v->domain;
 
     switch ( reg )
     {
+    case MSR_SPEC_CTRL:
+        return vmcb->spec_ctrl;
+
     default:
         printk(XENLOG_G_ERR "%s(%pv, 0x%08x) Bad register\n",
                __func__, v, reg);
@@ -2485,10 +2489,15 @@ static uint64_t svm_get_reg(struct vcpu
 
 static void svm_set_reg(struct vcpu *v, unsigned int reg, uint64_t val)
 {
+    struct vmcb_struct *vmcb = v->arch.hvm.svm.vmcb;
     struct domain *d = v->domain;
 
     switch ( reg )
     {
+    case MSR_SPEC_CTRL:
+        vmcb->spec_ctrl = val;
+        break;
+
     default:
         printk(XENLOG_G_ERR "%s(%pv, 0x%08x, 0x%016"PRIx64") Bad register\n",
                __func__, v, reg, val);
--- a/xen/arch/x86/msr.c
+++ b/xen/arch/x86/msr.c
@@ -264,7 +264,7 @@ int guest_rdmsr(struct vcpu *v, uint32_t
         break;
 
     case MSR_SPEC_CTRL:
-        if ( !cp->feat.ibrsb )
+        if ( !cp->feat.ibrsb && !cp->extd.ibrs )
             goto gp_fault;
         goto get_reg;
 
@@ -442,7 +442,8 @@ int guest_rdmsr(struct vcpu *v, uint32_t
  */
 uint64_t msr_spec_ctrl_valid_bits(const struct cpuid_policy *cp)
 {
-    bool ssbd = cp->feat.ssbd;
+    bool ssbd = cp->feat.ssbd || cp->extd.amd_ssbd;
+    bool psfd = cp->extd.psfd;
 
     /*
      * Note: SPEC_CTRL_STIBP is specified as safe to use (i.e. ignored)
@@ -450,6 +451,7 @@ uint64_t msr_spec_ctrl_valid_bits(const
      */
     return (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP |
             (ssbd       ? SPEC_CTRL_SSBD       : 0) |
+            (psfd       ? SPEC_CTRL_PSFD       : 0) |
             0);
 }
 
@@ -526,7 +528,7 @@ int guest_wrmsr(struct vcpu *v, uint32_t
         break;
 
     case MSR_SPEC_CTRL:
-        if ( !cp->feat.ibrsb ||
+        if ( (!cp->feat.ibrsb && !cp->extd.ibrs) ||
              (val & ~msr_spec_ctrl_valid_bits(cp)) )
             goto gp_fault;
         goto set_reg;
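
An example outcome under an AMD-style policy (policy bits hypothetical):

    /* cp->extd.ibrs = cp->extd.amd_ssbd = cp->extd.psfd = 1 */
    uint64_t valid = msr_spec_ctrl_valid_bits(cp);
    /* valid == SPEC_CTRL_IBRS | SPEC_CTRL_STIBP |
     *          SPEC_CTRL_SSBD | SPEC_CTRL_PSFD */

A guest write with any bit outside this mask still takes the #GP path in
guest_wrmsr() above.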

++++++ 61f933ac-SVM-enable-MSR_SPEC_CTRL-for-guests.patch ++++++
# Commit a7e7c7260cde78a148810db5320cbf39686c3e09
# Date 2022-02-01 13:20:44 +0000
# Author Andrew Cooper <[email protected]>
# Committer Andrew Cooper <[email protected]>
x86/cpuid: Enable MSR_SPEC_CTRL in SVM guests by default

With all other pieces in place, MSR_SPEC_CTRL is fully working for HVM guests.

Update the CPUID derivation logic (both PV and HVM, to avoid losing subtle
changes), drop the MSR intercept, and explicitly enable the CPUID bits for HVM
guests.

Signed-off-by: Andrew Cooper <[email protected]>
Reviewed-by: Jan Beulich <[email protected]>

--- a/xen/arch/x86/cpuid.c
+++ b/xen/arch/x86/cpuid.c
@@ -396,6 +396,8 @@ static void __init guest_common_feature_
      */
     if ( test_bit(X86_FEATURE_IBRSB, fs) )
         __set_bit(X86_FEATURE_STIBP, fs);
+    if ( test_bit(X86_FEATURE_IBRS, fs) )
+        __set_bit(X86_FEATURE_AMD_STIBP, fs);
 
     /*
      * On hardware which supports IBRS/IBPB, we can offer IBPB independently
@@ -419,11 +421,14 @@ static void __init calculate_pv_max_poli
         pv_featureset[i] &= pv_max_featuremask[i];
 
     /*
-     * If Xen isn't virtualising MSR_SPEC_CTRL for PV guests because of
-     * administrator choice, hide the feature.
+     * If Xen isn't virtualising MSR_SPEC_CTRL for PV guests (functional
+     * availability, or admin choice), hide the feature.
      */
     if ( !boot_cpu_has(X86_FEATURE_SC_MSR_PV) )
+    {
         __clear_bit(X86_FEATURE_IBRSB, pv_featureset);
+        __clear_bit(X86_FEATURE_IBRS, pv_featureset);
+    }
 
     guest_common_feature_adjustments(pv_featureset);
 
@@ -493,11 +498,14 @@ static void __init calculate_hvm_max_pol
         __set_bit(X86_FEATURE_SEP, hvm_featureset);
 
     /*
-     * If Xen isn't virtualising MSR_SPEC_CTRL for HVM guests because of
-     * administrator choice, hide the feature.
+     * If Xen isn't virtualising MSR_SPEC_CTRL for HVM guests (functional
+     * availability, or admin choice), hide the feature.
      */
     if ( !boot_cpu_has(X86_FEATURE_SC_MSR_HVM) )
+    {
         __clear_bit(X86_FEATURE_IBRSB, hvm_featureset);
+        __clear_bit(X86_FEATURE_IBRS, hvm_featureset);
+    }
 
     /*
      * With VT-x, some features are only supported by Xen if dedicated
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -606,6 +606,10 @@ static void svm_cpuid_policy_changed(str
 
     vmcb_set_exception_intercepts(vmcb, bitmap);
 
+    /* Give access to MSR_SPEC_CTRL if the guest has been told about it. */
+    svm_intercept_msr(v, MSR_SPEC_CTRL,
+                      cp->extd.ibrs ? MSR_INTERCEPT_NONE : MSR_INTERCEPT_RW);
+
     /* Give access to MSR_PRED_CMD if the guest has been told about it. */
     svm_intercept_msr(v, MSR_PRED_CMD,
                       cp->extd.ibpb ? MSR_INTERCEPT_NONE : MSR_INTERCEPT_RW);
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -254,18 +254,18 @@ XEN_CPUFEATURE(CLZERO,        8*32+ 0) /
 XEN_CPUFEATURE(RSTR_FP_ERR_PTRS, 8*32+ 2) /*A  (F)X{SAVE,RSTOR} always saves/restores FPU Error pointers */
 XEN_CPUFEATURE(WBNOINVD,      8*32+ 9) /*   WBNOINVD instruction */
 XEN_CPUFEATURE(IBPB,          8*32+12) /*A  IBPB support only (no IBRS, used by AMD) */
-XEN_CPUFEATURE(IBRS,          8*32+14) /*   MSR_SPEC_CTRL.IBRS */
-XEN_CPUFEATURE(AMD_STIBP,     8*32+15) /*   MSR_SPEC_CTRL.STIBP */
-XEN_CPUFEATURE(IBRS_ALWAYS,   8*32+16) /*   IBRS preferred always on */
-XEN_CPUFEATURE(STIBP_ALWAYS,  8*32+17) /*   STIBP preferred always on */
-XEN_CPUFEATURE(IBRS_FAST,     8*32+18) /*   IBRS preferred over software options */
-XEN_CPUFEATURE(IBRS_SAME_MODE, 8*32+19) /*   IBRS provides same-mode protection */
+XEN_CPUFEATURE(IBRS,          8*32+14) /*S  MSR_SPEC_CTRL.IBRS */
+XEN_CPUFEATURE(AMD_STIBP,     8*32+15) /*S  MSR_SPEC_CTRL.STIBP */
+XEN_CPUFEATURE(IBRS_ALWAYS,   8*32+16) /*S  IBRS preferred always on */
+XEN_CPUFEATURE(STIBP_ALWAYS,  8*32+17) /*S  STIBP preferred always on */
+XEN_CPUFEATURE(IBRS_FAST,     8*32+18) /*S  IBRS preferred over software options */
+XEN_CPUFEATURE(IBRS_SAME_MODE, 8*32+19) /*S  IBRS provides same-mode protection */
 XEN_CPUFEATURE(NO_LMSL,       8*32+20) /*S  EFER.LMSLE no longer supported. */
 XEN_CPUFEATURE(AMD_PPIN,      8*32+23) /*   Protected Processor Inventory Number */
-XEN_CPUFEATURE(AMD_SSBD,      8*32+24) /*   MSR_SPEC_CTRL.SSBD available */
+XEN_CPUFEATURE(AMD_SSBD,      8*32+24) /*S  MSR_SPEC_CTRL.SSBD available */
 XEN_CPUFEATURE(VIRT_SSBD,     8*32+25) /*   MSR_VIRT_SPEC_CTRL.SSBD */
 XEN_CPUFEATURE(SSB_NO,        8*32+26) /*A  Hardware not vulnerable to SSB */
-XEN_CPUFEATURE(PSFD,          8*32+28) /*   MSR_SPEC_CTRL.PSFD */
+XEN_CPUFEATURE(PSFD,          8*32+28) /*S  MSR_SPEC_CTRL.PSFD */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0.edx, word 9 */
 XEN_CPUFEATURE(AVX512_4VNNIW, 9*32+ 2) /*A  AVX512 Neural Network Instructions */
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -277,16 +277,20 @@ def crunch_numbers(state):
         # The features:
         #   * Single Thread Indirect Branch Predictors
         #   * Speculative Store Bypass Disable
+        #   * Predictive Store Forward Disable
         #
-        # enumerate new bits in MSR_SPEC_CTRL, which is enumerated by Indirect
-        # Branch Restricted Speculation/Indirect Branch Prediction Barrier.
+        # enumerate new bits in MSR_SPEC_CTRL, and technically enumerate
+        # MSR_SPEC_CTRL itself.  AMD further enumerates hints to guide OS
+        # behaviour.
         #
-        # In practice, these features also enumerate the presense of
-        # MSR_SPEC_CTRL.  However, no real hardware will exist with SSBD but
-        # not IBRSB, and we pass this MSR directly to guests.  Treating them
+        # However, no real hardware will exist with e.g. SSBD but not
+        # IBRSB/IBRS, and we pass this MSR directly to guests.  Treating them
         # as dependent features simplifies Xen's logic, and prevents the guest
         # from seeing implausible configurations.
         IBRSB: [STIBP, SSBD],
+        IBRS: [AMD_STIBP, AMD_SSBD, PSFD,
+               IBRS_ALWAYS, IBRS_FAST, IBRS_SAME_MODE],
+        AMD_STIBP: [STIBP_ALWAYS],
 
         # In principle the TSXLDTRK insns could also be considered independent.
         RTM: [TSXLDTRK],
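
The practical effect of the new dependency edges: clearing IBRS from a
featureset transitively clears AMD_STIBP, AMD_SSBD, PSFD, IBRS_ALWAYS,
IBRS_FAST and IBRS_SAME_MODE, and via AMD_STIBP also STIBP_ALWAYS.  This is
what lets calculate_{pv,hvm}_max_policy() above hide the whole AMD
MSR_SPEC_CTRL interface by clearing the single IBRS bit.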

++++++ 61f946a2-VMX-drop-SPEC_CTRL-load-on-VMEntry.patch ++++++
# Commit 9ce3ef20b4f085a7dc8ee41b0fec6fdeced3773e
# Date 2022-02-01 14:41:38 +0000
# Author Andrew Cooper <[email protected]>
# Committer Andrew Cooper <[email protected]>
x86/vmx: Drop spec_ctrl load in VMEntry path

This is not needed now that the VMEntry path is not responsible for loading
the guest's MSR_SPEC_CTRL value.

Fixes: 81f0eaadf84d ("x86/spec-ctrl: Fix NMI race condition with VT-x MSR_SPEC_CTRL handling")
Signed-off-by: Andrew Cooper <[email protected]>
Reviewed-by: Jan Beulich <[email protected]>

--- a/xen/arch/x86/hvm/vmx/entry.S
+++ b/xen/arch/x86/hvm/vmx/entry.S
@@ -85,9 +85,6 @@ UNLIKELY_END(realmode)
         test %al, %al
         jz .Lvmx_vmentry_restart
 
-        mov VCPU_arch_msrs(%rbx), %rax
-        mov VCPUMSR_spec_ctrl_raw(%rax), %eax
-
         /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */
         /* SPEC_CTRL_EXIT_TO_VMX   Req: %rsp=regs/cpuinfo              Clob:    */
         ALTERNATIVE "", __stringify(verw CPUINFO_verw_sel(%rsp)), X86_FEATURE_SC_VERW_HVM

++++++ 6202afa3-x86-clean-up-MSR_MCU_OPT_CTRL-handling.patch ++++++
# Commit 39a40f3835efcc25c1b05a25c321a01d7e11cbd7
# Date 2022-02-08 18:00:08 +0000
# Author Andrew Cooper <[email protected]>
# Committer Andrew Cooper <[email protected]>
x86/spec-ctrl: Clean up MSR_MCU_OPT_CTRL handling

Introduce cpu_has_srbds_ctrl as more users are going to appear shortly.

MSR_MCU_OPT_CTRL is gaining extra functionality, meaning that the current
default_xen_mcu_opt_ctrl is no longer a good fit.

Introduce two new helpers, update_mcu_opt_ctrl() which does a full RMW cycle
on the MSR, and set_in_mcu_opt_ctrl() which lets callers configure specific
bits at a time without clobbering each other's settings.

No functional change.

Signed-off-by: Andrew Cooper <[email protected]>
Reviewed-by: Jan Beulich <[email protected]>

--- a/xen/arch/x86/acpi/power.c
+++ b/xen/arch/x86/acpi/power.c
@@ -301,8 +301,7 @@ static int enter_state(u32 state)
         ci->last_spec_ctrl = default_xen_spec_ctrl;
     }
 
-    if ( boot_cpu_has(X86_FEATURE_SRBDS_CTRL) )
-        wrmsrl(MSR_MCU_OPT_CTRL, default_xen_mcu_opt_ctrl);
+    update_mcu_opt_ctrl();
 
     /* (re)initialise SYSCALL/SYSENTER state, amongst other things. */
     percpu_traps_init();
--- a/xen/arch/x86/cpu/intel.c
+++ b/xen/arch/x86/cpu/intel.c
@@ -15,6 +15,38 @@
 #include "cpu.h"
 
 /*
+ * MSR_MCU_OPT_CTRL is a collection of unrelated functionality, with separate
+ * enablement requirements, but which want to be consistent across the system.
+ */
+static uint32_t __read_mostly mcu_opt_ctrl_mask;
+static uint32_t __read_mostly mcu_opt_ctrl_val;
+
+void update_mcu_opt_ctrl(void)
+{
+    uint32_t mask = mcu_opt_ctrl_mask, lo, hi;
+
+    if ( !mask )
+        return;
+
+    rdmsr(MSR_MCU_OPT_CTRL, lo, hi);
+
+    lo &= ~mask;
+    lo |= mcu_opt_ctrl_val;
+
+    wrmsr(MSR_MCU_OPT_CTRL, lo, hi);
+}
+
+void __init set_in_mcu_opt_ctrl(uint32_t mask, uint32_t val)
+{
+    mcu_opt_ctrl_mask |= mask;
+
+    mcu_opt_ctrl_val &= ~mask;
+    mcu_opt_ctrl_val |= (val & mask);
+
+    update_mcu_opt_ctrl();
+}
+
+/*
  * Processors which have self-snooping capability can handle conflicting
  * memory type across CPUs by snooping its own cache. However, there exists
  * CPU models in which having conflicting memory types still leads to
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -133,6 +133,7 @@
 #define cpu_has_avx512_4vnniw   boot_cpu_has(X86_FEATURE_AVX512_4VNNIW)
 #define cpu_has_avx512_4fmaps   boot_cpu_has(X86_FEATURE_AVX512_4FMAPS)
 #define cpu_has_avx512_vp2intersect boot_cpu_has(X86_FEATURE_AVX512_VP2INTERSECT)
+#define cpu_has_srbds_ctrl      boot_cpu_has(X86_FEATURE_SRBDS_CTRL)
 #define cpu_has_rtm_always_abort boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT)
 #define cpu_has_tsx_force_abort boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)
 #define cpu_has_serialize       boot_cpu_has(X86_FEATURE_SERIALIZE)
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -630,6 +630,9 @@ extern int8_t opt_tsx, cpu_has_tsx_ctrl;
 extern bool rtm_disabled;
 void tsx_init(void);
 
+void update_mcu_opt_ctrl(void);
+void set_in_mcu_opt_ctrl(uint32_t mask, uint32_t val);
+
 enum ap_boot_method {
     AP_BOOT_NORMAL,
     AP_BOOT_SKINIT,
--- a/xen/include/asm-x86/spec_ctrl.h
+++ b/xen/include/asm-x86/spec_ctrl.h
@@ -54,8 +54,6 @@ extern int8_t opt_pv_l1tf_hwdom, opt_pv_
  */
 extern paddr_t l1tf_addr_mask, l1tf_safe_maddr;
 
-extern uint64_t default_xen_mcu_opt_ctrl;
-
 static inline void init_shadow_spec_ctrl_state(void)
 {
     struct cpu_info *info = get_cpu_info();
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -384,8 +384,7 @@ void start_secondary(void *unused)
         wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl);
         info->last_spec_ctrl = default_xen_spec_ctrl;
     }
-    if ( boot_cpu_has(X86_FEATURE_SRBDS_CTRL) )
-        wrmsrl(MSR_MCU_OPT_CTRL, default_xen_mcu_opt_ctrl);
+    update_mcu_opt_ctrl();
 
     tsx_init(); /* Needs microcode.  May change HLE/RTM feature bits. */
 
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -67,7 +67,6 @@ static bool __initdata cpu_has_bug_msbds
 static bool __initdata cpu_has_bug_mds; /* Any other M{LP,SB,FB}DS combination. */
 
 static int8_t __initdata opt_srb_lock = -1;
-uint64_t __read_mostly default_xen_mcu_opt_ctrl;
 
 static int __init parse_spec_ctrl(const char *s)
 {
@@ -376,7 +375,7 @@ static void __init print_details(enum in
            (default_xen_spec_ctrl & SPEC_CTRL_SSBD)  ? " SSBD+" : " SSBD-",
            !(caps & ARCH_CAPS_TSX_CTRL)              ? "" :
            (opt_tsx & 1)                             ? " TSX+" : " TSX-",
-           !boot_cpu_has(X86_FEATURE_SRBDS_CTRL)     ? "" :
+           !cpu_has_srbds_ctrl                       ? "" :
           opt_srb_lock                              ? " SRB_LOCK+" : " SRB_LOCK-",
            opt_ibpb                                  ? " IBPB"  : "",
            opt_l1d_flush                             ? " L1D_FLUSH" : "",
@@ -1251,32 +1250,24 @@ void __init init_speculation_mitigations
         tsx_init();
     }
 
-    /* Calculate suitable defaults for MSR_MCU_OPT_CTRL */
-    if ( boot_cpu_has(X86_FEATURE_SRBDS_CTRL) )
+    /*
+     * On some SRBDS-affected hardware, it may be safe to relax srb-lock by
+     * default.
+     *
+     * On parts which enumerate MDS_NO and not TAA_NO, TSX is the only known
+     * way to access the Fill Buffer.  If TSX isn't available (inc. SKU
+     * reasons on some models), or TSX is explicitly disabled, then there is
+     * no need for the extra overhead to protect RDRAND/RDSEED.
+     */
+    if ( cpu_has_srbds_ctrl )
     {
-        uint64_t val;
-
-        rdmsrl(MSR_MCU_OPT_CTRL, val);
-
-        /*
-         * On some SRBDS-affected hardware, it may be safe to relax srb-lock
-         * by default.
-         *
-         * On parts which enumerate MDS_NO and not TAA_NO, TSX is the only way
-         * to access the Fill Buffer.  If TSX isn't available (inc. SKU
-         * reasons on some models), or TSX is explicitly disabled, then there
-         * is no need for the extra overhead to protect RDRAND/RDSEED.
-         */
         if ( opt_srb_lock == -1 &&
             (caps & (ARCH_CAPS_MDS_NO|ARCH_CAPS_TAA_NO)) == ARCH_CAPS_MDS_NO &&
              (!cpu_has_hle || ((caps & ARCH_CAPS_TSX_CTRL) && rtm_disabled)) )
             opt_srb_lock = 0;
 
-        val &= ~MCU_OPT_CTRL_RNGDS_MITG_DIS;
-        if ( !opt_srb_lock )
-            val |= MCU_OPT_CTRL_RNGDS_MITG_DIS;
-
-        default_xen_mcu_opt_ctrl = val;
+        set_in_mcu_opt_ctrl(MCU_OPT_CTRL_RNGDS_MITG_DIS,
+                            opt_srb_lock ? 0 : MCU_OPT_CTRL_RNGDS_MITG_DIS);
     }
 
     print_details(thunk, caps);
@@ -1314,9 +1305,6 @@ void __init init_speculation_mitigations
         wrmsrl(MSR_SPEC_CTRL, val);
         info->last_spec_ctrl = val;
     }
-
-    if ( boot_cpu_has(X86_FEATURE_SRBDS_CTRL) )
-        wrmsrl(MSR_MCU_OPT_CTRL, default_xen_mcu_opt_ctrl);
 }
 
 static void __init __maybe_unused build_assertions(void)
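
How the two helpers compose, as a sketch (MCU_OPT_CTRL_RTM_ALLOW is only
introduced by a later patch in this series; it is used here purely for
illustration):

    /* Each caller owns only its own mask; other callers' bits are kept. */
    set_in_mcu_opt_ctrl(MCU_OPT_CTRL_RNGDS_MITG_DIS,
                        opt_srb_lock ? 0 : MCU_OPT_CTRL_RNGDS_MITG_DIS);
    set_in_mcu_opt_ctrl(MCU_OPT_CTRL_RTM_ALLOW, 0);

    /* APs and the S3 resume path replay the accumulated (mask, val) pair
     * with a read-modify-write of MSR_MCU_OPT_CTRL. */
    update_mcu_opt_ctrl();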

++++++ 6202afa4-x86-TSX-move-has_rtm_always_abort.patch ++++++
# Commit 4116139131e93b4f075e5442e3c1b424280f6f1f
# Date 2022-02-08 18:00:08 +0000
# Author Andrew Cooper <[email protected]>
# Committer Andrew Cooper <[email protected]>
x86/tsx: Move has_rtm_always_abort to an outer scope

We are about to introduce a second path which needs to conditionally force the
presence of RTM_ALWAYS_ABORT.

No functional change.

Signed-off-by: Andrew Cooper <[email protected]>
Reviewed-by: Jan Beulich <[email protected]>

--- a/xen/arch/x86/tsx.c
+++ b/xen/arch/x86/tsx.c
@@ -42,6 +42,7 @@ void tsx_init(void)
     if ( unlikely(cpu_has_tsx_ctrl < 0) )
     {
         uint64_t caps = 0;
+        bool has_rtm_always_abort;
 
         if ( boot_cpu_data.cpuid_level >= 7 )
             boot_cpu_data.x86_capability[cpufeat_word(X86_FEATURE_ARCH_CAPS)]
@@ -51,6 +52,7 @@ void tsx_init(void)
             rdmsrl(MSR_ARCH_CAPABILITIES, caps);
 
         cpu_has_tsx_ctrl = !!(caps & ARCH_CAPS_TSX_CTRL);
+        has_rtm_always_abort = cpu_has_rtm_always_abort;
 
         if ( cpu_has_tsx_force_abort )
         {
@@ -67,11 +69,7 @@ void tsx_init(void)
              * RTM_ALWAYS_ABORT enumerates the new functionality, but is also
              * read as zero if TSX_FORCE_ABORT.ENABLE_RTM has been set before
              * we run.
-             *
-             * Undo this behaviour in Xen's view of the world.
              */
-            bool has_rtm_always_abort = cpu_has_rtm_always_abort;
-
             if ( !has_rtm_always_abort )
             {
                 uint64_t val;
@@ -83,15 +81,6 @@ void tsx_init(void)
             }
 
             /*
-             * Always force RTM_ALWAYS_ABORT, even if it currently visible.
-             * If the user explicitly opts to enable TSX, we'll set
-             * TSX_FORCE_ABORT.ENABLE_RTM and cause RTM_ALWAYS_ABORT to be
-             * hidden from the general CPUID scan later.
-             */
-            if ( has_rtm_always_abort )
-                setup_force_cpu_cap(X86_FEATURE_RTM_ALWAYS_ABORT);
-
-            /*
              * If no explicit tsx= option is provided, pick a default.
              *
              * This deliberately overrides the implicit opt_tsx=-3 from
@@ -108,10 +97,19 @@ void tsx_init(void)
              * With RTM_ALWAYS_ABORT, disable TSX.
              */
             if ( opt_tsx < 0 )
-                opt_tsx = !cpu_has_rtm_always_abort;
+                opt_tsx = !has_rtm_always_abort;
         }
 
         /*
+         * Always force RTM_ALWAYS_ABORT, even if it is currently visible.  If
+         * the user explicitly opts to enable TSX, we'll set the appropriate
+         * RTM_ENABLE bit and cause RTM_ALWAYS_ABORT to be hidden from the
+         * general CPUID scan later.
+         */
+        if ( has_rtm_always_abort )
+            setup_force_cpu_cap(X86_FEATURE_RTM_ALWAYS_ABORT);
+
+        /*
          * The TSX features (HLE/RTM) are handled specially.  They both
          * enumerate features but, on certain parts, have mechanisms to be
          * hidden without disrupting running software.

++++++ 6202afa5-x86-TSX-cope-with-deprecation-on-WHL-R-CFL-R.patch ++++++
# Commit ad9f7c3b2e0df38ad6d54f4769d4dccf765fbcee
# Date 2022-02-08 18:00:08 +0000
# Author Andrew Cooper <[email protected]>
# Committer Andrew Cooper <[email protected]>
x86/tsx: Cope with TSX deprecation on WHL-R/CFL-R

The February 2022 microcode is formally de-featuring TSX on the TAA-impacted
client CPUs.  The backup TAA mitigation (VERW regaining its flushing side
effect) is being dropped, meaning that `smt=0 spec-ctrl=md-clear` no longer
protects against TAA on these parts.

The new functionality enumerates itself via the RTM_ALWAYS_ABORT CPUID
bit (the same as June 2021), but has its control in MSR_MCU_OPT_CTRL as
opposed to MSR_TSX_FORCE_ABORT.

TSX now defaults to being disabled on ucode load.  Furthermore, if SGX is
enabled in the BIOS, TSX is locked and cannot be re-enabled.  In this case,
override opt_tsx to 0, so the RTM/HLE CPUID bits get hidden by default.

While updating the command line documentation, take the opportunity to add a
paragraph explaining what TSX being disabled actually means, and how migration
compatibility works.

Signed-off-by: Andrew Cooper <[email protected]>
Reviewed-by: Jan Beulich <[email protected]>

--- a/docs/misc/xen-command-line.pandoc
+++ b/docs/misc/xen-command-line.pandoc
@@ -2371,7 +2371,9 @@ Several microcode updates are relevant:
    Introduced MSR_TSX_CTRL on all TSX-enabled MDS_NO parts to date,
    CLX/WHL-R/CFL-R, with the controls becoming architectural moving forward
    and formally retiring HLE from the architecture.  The user can disable TSX
-   to mitigate TAA, and elect to hide the HLE/RTM CPUID bits.
+   to mitigate TAA, and elect to hide the HLE/RTM CPUID bits.  Also causes
+   VERW to once again flush the microarchitectural buffers in case a TAA
+   mitigation is wanted along with TSX being enabled.
 
  * June 2021, removing the workaround for March 2019 on client CPUs and
    formally de-featured TSX on SKL/KBL/WHL/CFL (Note: SKX still retains the
@@ -2379,19 +2381,32 @@ Several microcode updates are relevant:
    PCR3 works fine, and TSX is disabled by default, but the user can re-enable
    TSX at their own risk, accepting that the memory order erratum is unfixed.
 
+ * February 2022, removing the VERW flushing workaround from November 2019 on
+   client CPUs and formally de-featuring TSX on WHL-R/CFL-R (Note: CLX still
+   retains the VERW flushing workaround).  TSX defaults to disabled, and is
+   locked off when SGX is enabled in the BIOS.  When SGX is not enabled, TSX
+   can be re-enabled at the user's own risk, as it reintroduces the TSX Async
+   Abort speculative vulnerability.
+
 On systems with the ability to configure TSX, this boolean offers system-wide
 control of whether TSX is enabled or disabled.
 
+When TSX is disabled, transactions unconditionally abort.  This is compatible
+with the TSX spec, which requires software to have a non-transactional path as
+a fallback.  The RTM and HLE CPUID bits are hidden from VMs by default, but
+can be re-enabled if required.  This allows VMs which previously saw RTM/HLE
+to be migrated in, although any TSX-enabled software will run with reduced
+performance.
+
+ * When TSX is locked off by firmware, `tsx=` is ignored and treated as
+   `false`.
+
  * An explicit `tsx=` choice is honoured, even if it is `true` and would
    result in a vulnerable system.
 
  * When no explicit `tsx=` choice is given, parts vulnerable to TAA will be
    mitigated by disabling TSX, as this is the lowest overhead option.
 
-   If the use of TSX is important, the more expensive TAA mitigations can be
-   opted in to with `smt=0 spec-ctrl=md-clear`, at which point TSX will remain
-   active by default.
-
  * When no explicit `tsx=` option is given, parts susceptible to the memory
    ordering errata default to `true` to enable working TSX.  Alternatively,
    selecting `tsx=0` will disable TSX and restore PCR3 to a working state.
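
[Editor's note: the new documentation paragraph above leans on the abort-path
requirement in the TSX spec.  A minimal userspace sketch of that pattern,
assuming GCC/Clang built with -mrtm; do_work() and the lock helpers are
placeholders:

    #include <immintrin.h>

    extern void take_lock(void), drop_lock(void);
    extern void do_work(void);

    void critical_section(void)
    {
        if ( _xbegin() == _XBEGIN_STARTED )
        {
            do_work();                  /* Transactional path. */
            _xend();
        }
        else
        {
            take_lock();                /* With TSX disabled, transactions
                                         * unconditionally abort to here. */
            do_work();
            drop_lock();
        }
    }

This is why hiding RTM/HLE from CPUID is a performance question rather than a
correctness one: software written to spec already copes with every
transaction aborting.]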
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
@@ -78,6 +78,8 @@
 
 #define MSR_MCU_OPT_CTRL                    0x00000123
 #define  MCU_OPT_CTRL_RNGDS_MITG_DIS        (_AC(1, ULL) <<  0)
+#define  MCU_OPT_CTRL_RTM_ALLOW             (_AC(1, ULL) <<  1)
+#define  MCU_OPT_CTRL_RTM_LOCKED            (_AC(1, ULL) <<  2)
 
 #define MSR_RTIT_OUTPUT_BASE                0x00000560
 #define MSR_RTIT_OUTPUT_MASK                0x00000561
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -1233,11 +1233,14 @@ void __init init_speculation_mitigations
      * the MDS mitigation of disabling HT and using VERW flushing.
      *
      * On CPUs which advertise MDS_NO, VERW has no flushing side effect until
-     * the TSX_CTRL microcode is loaded, despite the MD_CLEAR CPUID bit being
+     * the TSX_CTRL microcode (Nov 2019), despite the MD_CLEAR CPUID bit being
      * advertised, and there isn't a MD_CLEAR_2 flag to use...
      *
+     * Furthermore, the VERW flushing side effect is removed again on client
+     * parts with the Feb 2022 microcode.
+     *
      * If we're on affected hardware, able to do something about it (which
-     * implies that VERW now works), no explicit TSX choice and traditional
+     * implies that VERW might work), no explicit TSX choice and traditional
      * MDS mitigations (no-SMT, VERW) not obviously in use (someone might
      * plausibly value TSX higher than Hyperthreading...), disable TSX to
      * mitigate TAA.
--- a/xen/arch/x86/tsx.c
+++ b/xen/arch/x86/tsx.c
@@ -14,6 +14,9 @@
  * This is arranged such that the bottom bit encodes whether TSX is actually
  * disabled, while identifying various explicit (>=0) and implicit (<0)
  * conditions.
+ *
+ * This option only has any effect on systems presenting a mechanism of
+ * controlling TSX behaviour, and where TSX isn't force-disabled by firmware.
  */
 int8_t __read_mostly opt_tsx = -1;
 int8_t __read_mostly cpu_has_tsx_ctrl = -1;
@@ -54,6 +57,66 @@ void tsx_init(void)
         cpu_has_tsx_ctrl = !!(caps & ARCH_CAPS_TSX_CTRL);
         has_rtm_always_abort = cpu_has_rtm_always_abort;
 
+        if ( cpu_has_tsx_ctrl && cpu_has_srbds_ctrl )
+        {
+            /*
+             * On a TAA-vulnerable or later part with at least the May 2020
+             * microcode mitigating SRBDS.
+             */
+            uint64_t val;
+
+            rdmsrl(MSR_MCU_OPT_CTRL, val);
+
+            /*
+             * Probe for the February 2022 microcode which de-features TSX on
+             * TAA-vulnerable client parts - WHL-R/CFL-R.
+             *
+             * RTM_ALWAYS_ABORT (read above) enumerates the new functionality,
+             * but is read as zero if MCU_OPT_CTRL.RTM_ALLOW has been set
+             * before we run.  Undo this.
+             */
+            if ( val & MCU_OPT_CTRL_RTM_ALLOW )
+                has_rtm_always_abort = true;
+
+            if ( has_rtm_always_abort )
+            {
+                if ( val & MCU_OPT_CTRL_RTM_LOCKED )
+                {
+                    /*
+                     * If RTM_LOCKED is set, TSX is disabled because SGX is
+                     * enabled, and there is nothing we can do.  Override with
+                     * tsx=0 so all other logic takes sensible actions.
+                     */
+                    printk(XENLOG_WARNING "TSX locked by firmware - disabling\n");
+                    opt_tsx = 0;
+                }
+                else
+                {
+                    /*
+                     * Otherwise, set RTM_ALLOW.  Not because we necessarily
+                     * intend to enable RTM, but it prevents
+                     * MSR_TSX_CTRL.RTM_DISABLE from being ignored, thus
+                     * allowing the rest of the TSX selection logic to work as
+                     * before.
+                     */
+                    val |= MCU_OPT_CTRL_RTM_ALLOW;
+                }
+
+                set_in_mcu_opt_ctrl(
+                    MCU_OPT_CTRL_RTM_LOCKED | MCU_OPT_CTRL_RTM_ALLOW, val);
+
+                /*
+                 * If no explicit tsx= option is provided, pick a default.
+                 *
+                 * With RTM_ALWAYS_ABORT, the default ucode behaviour is to
+                 * disable, so match that.  This does not override explicit user
+                 * choices, or implicit choices as a side effect of spec-ctrl=0.
+                 */
+                if ( opt_tsx == -1 )
+                    opt_tsx = 0;
+            }
+        }
+
         if ( cpu_has_tsx_force_abort )
         {
             /*
@@ -142,6 +205,19 @@ void tsx_init(void)
      */
     if ( cpu_has_tsx_ctrl )
     {
+        /*
+         * On a TAA-vulnerable part with at least the November 2019 microcode,
+         * or newer part with TAA fixed.
+         *
+         * Notes:
+         *  - With the February 2022 microcode, if SGX has caused TSX to be
+         *    locked off, opt_tsx is overridden to 0.  TSX_CTRL.RTM_DISABLE is
+         *    an ignored bit, but we write it such that it matches the
+         *    behaviour enforced by microcode.
+         *  - Otherwise, if SGX isn't enabled and TSX is available to be
+         *    controlled, we have or will set MSR_MCU_OPT_CTRL.RTM_ALLOW to
+         *    let TSX_CTRL.RTM_DISABLE be usable.
+         */
         uint32_t hi, lo;
 
         rdmsr(MSR_TSX_CTRL, lo, hi);
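
[Editor's note: downstream of the hunks above, the pre-existing tail of
tsx_init() turns opt_tsx into an MSR_TSX_CTRL write.  A simplified sketch for
orientation, assuming the long-standing TSX_CTRL bit names from msr-index.h;
the wrapper function is hypothetical:

    void tsx_apply_opt(void)                 /* hypothetical wrapper */
    {
        uint32_t hi, lo;
        bool rtm_disabled = !(opt_tsx & 1);  /* Bottom bit: 1 == enabled. */

        rdmsr(MSR_TSX_CTRL, lo, hi);

        lo &= ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR);
        if ( rtm_disabled )
            lo |= TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR;

        wrmsr(MSR_TSX_CTRL, lo, hi);
    }
]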

++++++ 6202afa7-x86-CPUID-leaf-7-2-EDX-infra.patch ++++++
# Commit f3709b15fc86c6c6a0959cec8d97f21d0e9f9629
# Date 2022-02-08 18:00:08 +0000
# Author Andrew Cooper <[email protected]>
# Committer Andrew Cooper <[email protected]>
x86/cpuid: Infrastructure for cpuid word 7:2.edx

While in principle it would be nice to keep leaf 7 in order, that would
involve having an extra 5 words of zeros in a featureset.

Signed-off-by: Andrew Cooper <[email protected]>
Reviewed-by: Jan Beulich <[email protected]>
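
[Editor's note: the five zero words in question would be 7:1.ecx, 7:1.edx,
7:2.eax, 7:2.ebx and 7:2.ecx, none of which enumerate features yet, so
7:2.edx jumps straight to word 13.  A standalone illustration of the word/bit
encoding, borrowing the INTEL_PSFD bit that the follow-up patch places in
this word:

    #include <stdio.h>

    /* Xen encodes each feature as word * 32 + bit within the featureset. */
    #define X86_FEATURE_INTEL_PSFD (13 * 32 + 0)    /* word FEATURESET_7d2 */

    int main(void)
    {
        unsigned int f = X86_FEATURE_INTEL_PSFD;

        /* Prints: word 13, bit 0, mask 0x00000001 */
        printf("word %u, bit %u, mask 0x%08x\n",
               f / 32, f % 32, 1u << (f % 32));
        return 0;
    }
]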

--- a/tools/misc/xen-cpuid.c
+++ b/tools/misc/xen-cpuid.c
@@ -197,6 +197,10 @@ static const char *const str_7b1[32] =
 {
 };
 
+static const char *const str_7d2[32] =
+{
+};
+
 static const struct {
     const char *name;
     const char *abbr;
@@ -216,6 +220,7 @@ static const struct {
     { "0x00000007:1.eax", "7a1", str_7a1 },
     { "0x80000021.eax",  "e21a", str_e21a },
     { "0x00000007:1.ebx", "7b1", str_7b1 },
+    { "0x00000007:2.edx", "7d2", str_7d2 },
 };
 
 #define COL_ALIGN "18"
--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -447,6 +447,10 @@ static void generic_identify(struct cpui
                                    &c->x86_capability[FEATURESET_7a1],
                                    &c->x86_capability[FEATURESET_7b1],
                                    &tmp, &tmp);
+               if (max_subleaf >= 2)
+                       cpuid_count(7, 2,
+                                   &tmp, &tmp, &tmp,
+                                   &c->x86_capability[FEATURESET_7d2]);
        }
 
        if (c->cpuid_level >= 0xd)
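
[Editor's note: the max_subleaf guard above matters because leaf 7's EAX at
subleaf 0 reports the highest valid subleaf.  The same pattern can be
reproduced from userspace with the compiler's cpuid.h wrapper; illustrative
only:

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int max, a, b, c, d;

        __cpuid_count(7, 0, max, b, c, d);   /* EAX == max leaf-7 subleaf. */
        if ( max >= 2 )
        {
            __cpuid_count(7, 2, a, b, c, d);
            printf("7:2.edx = 0x%08x\n", d); /* Bit 0: PSFD, on new ucode. */
        }
        return 0;
    }
]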
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -299,6 +299,8 @@ XEN_CPUFEATURE(NSCB,               11*32
 
 /* Intel-defined CPU features, CPUID level 0x00000007:1.ebx, word 12 */
 
+/* Intel-defined CPU features, CPUID level 0x00000007:2.edx, word 13 */
+
 #endif /* XEN_CPUFEATURE */
 
 /* Clean up from a default include.  Close the enum (for C). */
--- a/xen/include/xen/lib/x86/cpuid.h
+++ b/xen/include/xen/lib/x86/cpuid.h
@@ -17,6 +17,7 @@
 #define FEATURESET_7a1   10 /* 0x00000007:1.eax    */
 #define FEATURESET_e21a  11 /* 0x80000021.eax      */
 #define FEATURESET_7b1   12 /* 0x00000007:1.ebx    */
+#define FEATURESET_7d2   13 /* 0x00000007:2.edx    */
 
 struct cpuid_leaf
 {
@@ -82,7 +83,7 @@ const char *x86_cpuid_vendor_to_str(unsi
 
 #define CPUID_GUEST_NR_BASIC      (0xdu + 1)
 #define CPUID_GUEST_NR_CACHE      (5u + 1)
-#define CPUID_GUEST_NR_FEAT       (1u + 1)
+#define CPUID_GUEST_NR_FEAT       (2u + 1)
 #define CPUID_GUEST_NR_TOPO       (1u + 1)
 #define CPUID_GUEST_NR_XSTATE     (62u + 1)
 #define CPUID_GUEST_NR_EXTD_INTEL (0x8u + 1)
@@ -193,6 +194,14 @@ struct cpuid_policy
                 uint32_t _7b1;
                 struct { DECL_BITFIELD(7b1); };
             };
+            uint32_t /* c */:32, /* d */:32;
+
+            /* Subleaf 2. */
+            uint32_t /* a */:32, /* b */:32, /* c */:32;
+            union {
+                uint32_t _7d2;
+                struct { DECL_BITFIELD(7d2); };
+            };
         };
     } feat;
 
@@ -333,6 +342,7 @@ static inline void cpuid_policy_to_featu
     fs[FEATURESET_7a1] = p->feat._7a1;
     fs[FEATURESET_e21a] = p->extd.e21a;
     fs[FEATURESET_7b1] = p->feat._7b1;
+    fs[FEATURESET_7d2] = p->feat._7d2;
 }
 
 /* Fill in a CPUID policy from a featureset bitmap. */
@@ -352,6 +362,7 @@ static inline void cpuid_featureset_to_p
     p->feat._7a1  = fs[FEATURESET_7a1];
     p->extd.e21a  = fs[FEATURESET_e21a];
     p->feat._7b1  = fs[FEATURESET_7b1];
+    p->feat._7d2  = fs[FEATURESET_7d2];
 }
 
 static inline uint64_t cpuid_policy_xcr0_max(const struct cpuid_policy *p)

++++++ 6202afa8-x86-Intel-PSFD-for-guests.patch ++++++
# Commit 52ce1c97844db213de01c5300eaaa8cf101a285f
# Date 2022-02-08 18:00:08 +0000
# Author Andrew Cooper <[email protected]>
# Committer Andrew Cooper <[email protected]>
x86/spec-ctrl: Support Intel PSFD for guests

The Feb 2022 microcode from Intel retrofits AMD's MSR_SPEC_CTRL.PSFD interface
to Sunny Cove (IceLake) and later cores.

Update the MSR_SPEC_CTRL emulation, and expose it to guests.

Signed-off-by: Andrew Cooper <[email protected]>
Reviewed-by: Jan Beulich <[email protected]>
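
[Editor's note: a condensed sketch of what the widened valid-bits calculation
permits after this change.  The policy field and SPEC_CTRL_* names match the
hunks below; the function shape is simplified from Xen's
msr_spec_ctrl_valid_bits():

    static uint64_t spec_ctrl_valid(const struct cpuid_policy *cp)
    {
        /* STIBP is specified as safe to use even when not enumerated. */
        uint64_t valid = SPEC_CTRL_IBRS | SPEC_CTRL_STIBP;

        if ( cp->feat.ssbd || cp->extd.amd_ssbd )
            valid |= SPEC_CTRL_SSBD;
        if ( cp->feat.intel_psfd || cp->extd.psfd )   /* <- widened here */
            valid |= SPEC_CTRL_PSFD;

        return valid;    /* Guest WRMSR of any other bit yields #GP. */
    }
]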

--- a/tools/libs/light/libxl_cpuid.c
+++ b/tools/libs/light/libxl_cpuid.c
@@ -234,6 +234,8 @@ int libxl_cpuid_parse_config(libxl_cpuid
         {"fsrs",         0x00000007,  1, CPUID_REG_EAX, 11,  1},
         {"fsrcs",        0x00000007,  1, CPUID_REG_EAX, 12,  1},
 
+        {"intel-psfd",   0x00000007,  2, CPUID_REG_EDX,  0,  1},
+
         {"lahfsahf",     0x80000001, NA, CPUID_REG_ECX,  0,  1},
         {"cmplegacy",    0x80000001, NA, CPUID_REG_ECX,  1,  1},
         {"svm",          0x80000001, NA, CPUID_REG_ECX,  2,  1},
--- a/tools/misc/xen-cpuid.c
+++ b/tools/misc/xen-cpuid.c
@@ -199,6 +199,7 @@ static const char *const str_7b1[32] =
 
 static const char *const str_7d2[32] =
 {
+    [ 0] = "intel-psfd",
 };
 
 static const struct {
--- a/xen/arch/x86/msr.c
+++ b/xen/arch/x86/msr.c
@@ -443,7 +443,7 @@ int guest_rdmsr(struct vcpu *v, uint32_t
 uint64_t msr_spec_ctrl_valid_bits(const struct cpuid_policy *cp)
 {
     bool ssbd = cp->feat.ssbd || cp->extd.amd_ssbd;
-    bool psfd = cp->extd.psfd;
+    bool psfd = cp->feat.intel_psfd || cp->extd.psfd;
 
     /*
      * Note: SPEC_CTRL_STIBP is specified as safe to use (i.e. ignored)
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -307,11 +307,13 @@ custom_param("pv-l1tf", parse_pv_l1tf);
 
 static void __init print_details(enum ind_thunk thunk, uint64_t caps)
 {
-    unsigned int _7d0 = 0, e8b = 0, tmp;
+    unsigned int _7d0 = 0, _7d2 = 0, e8b = 0, max = 0, tmp;
 
     /* Collect diagnostics about available mitigations. */
     if ( boot_cpu_data.cpuid_level >= 7 )
-        cpuid_count(7, 0, &tmp, &tmp, &tmp, &_7d0);
+        cpuid_count(7, 0, &max, &tmp, &tmp, &_7d0);
+    if ( max >= 2 )
+        cpuid_count(7, 2, &tmp, &tmp, &tmp, &_7d2);
     if ( boot_cpu_data.extended_cpuid_level >= 0x80000008 )
         cpuid(0x80000008, &tmp, &e8b, &tmp, &tmp);
 
@@ -345,6 +347,7 @@ static void __init print_details(enum in
            (_7d0 & cpufeat_mask(X86_FEATURE_STIBP))          ? " STIBP"          : "",
            (e8b  & cpufeat_mask(X86_FEATURE_AMD_SSBD)) ||
            (_7d0 & cpufeat_mask(X86_FEATURE_SSBD))           ? " SSBD"           : "",
+           (_7d2 & cpufeat_mask(X86_FEATURE_INTEL_PSFD)) ||
            (e8b  & cpufeat_mask(X86_FEATURE_PSFD))           ? " PSFD"           : "",
            (_7d0 & cpufeat_mask(X86_FEATURE_L1D_FLUSH))      ? " L1D_FLUSH"      : "",
            (_7d0 & cpufeat_mask(X86_FEATURE_MD_CLEAR))       ? " MD_CLEAR"       : "",
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -300,6 +300,7 @@ XEN_CPUFEATURE(NSCB,               11*32
 /* Intel-defined CPU features, CPUID level 0x00000007:1.ebx, word 12 */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:2.edx, word 13 */
+XEN_CPUFEATURE(INTEL_PSFD,         13*32+ 0) /*A  MSR_SPEC_CTRL.PSFD */
 
 #endif /* XEN_CPUFEATURE */
 
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -287,7 +287,7 @@ def crunch_numbers(state):
         # IBRSB/IBRS, and we pass this MSR directly to guests.  Treating them
         # as dependent features simplifies Xen's logic, and prevents the guest
         # from seeing implausible configurations.
-        IBRSB: [STIBP, SSBD],
+        IBRSB: [STIBP, SSBD, INTEL_PSFD],
         IBRS: [AMD_STIBP, AMD_SSBD, PSFD,
                IBRS_ALWAYS, IBRS_FAST, IBRS_SAME_MODE],
         AMD_STIBP: [STIBP_ALWAYS],
