Script 'mail_helper' called by obssrc
Hello community,

here is the log from the commit of package xen for openSUSE:Factory checked in 
at 2023-04-01 23:26:54
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/xen (Old)
 and      /work/SRC/openSUSE:Factory/.xen.new.9019 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Package is "xen"

Sat Apr  1 23:26:54 2023 rev:330 rq:1075603 version:4.17.0_06

Changes:
--------
--- /work/SRC/openSUSE:Factory/xen/xen.changes  2023-03-11 18:23:12.274619379 +0100
+++ /work/SRC/openSUSE:Factory/.xen.new.9019/xen.changes        2023-04-01 23:26:56.439318701 +0200
@@ -1,0 +2,48 @@
+Thu Mar 23 08:10:00 CET 2023 - jbeul...@suse.com
+
+- Upstream bug fixes (bsc#1027519)
+  63a03b73-VMX-VMExit-based-BusLock-detection.patch
+  63a03ba6-VMX-INTR_SHADOW_NMI-helper.patch
+  63a03bce-VMX-Notify-VMExit.patch
+  63e53ac9-x86-CPUID-leaves-7-1-ecx-edx.patch
+  63e53ac9-x86-disable-CET-SS-when-fractured-updates.patch
+  63f4d045-x86-ucode-AMD-apply-early-on-all-threads.patch
+  63fe06e0-x86-ucode-AMD-apply-late-on-all-threads.patch
+  641041e8-VT-d-constrain-IGD-check.patch
+  6419697d-AMD-IOMMU-no-XT-x2APIC-phys.patch
+- Use "proper" upstream backports:
+  640f3035-x86-altp2m-help-gcc13.patch
+  64104238-bunzip-gcc13.patch
+  64199e0c-x86-shadow-account-for-log-dirty-mode.patch
+  64199e0d-x86-HVM-bound-number-of-pca-regions.patch
+  64199e0e-x86-HVM-serialize-pca-list-manipulation.patch
+  64199e0f-x86-spec-ctrl-defer-CR4_PV32_RESTORE-for-CSTAR.patch
+- ... in place of:
+  bunzip-gcc13.patch
+  altp2m-gcc13.patch
+  xsa427.patch
+  xsa428-1.patch
+  xsa428-2.patch
+  xsa429.patch
+
+-------------------------------------------------------------------
+Thu Mar 16 08:08:08 UTC 2023 - oher...@suse.de
+
+- bsc#1209245 - fix host-assisted kexec/kdump for HVM domUs
+  libxl.fix-guest-kexec-skip-cpuid-policy.patch
+
+-------------------------------------------------------------------
+Tue Mar  7 10:44:12 MST 2023 - carn...@suse.com
+
+- bsc#1209017 - VUL-0: CVE-2022-42332: xen: x86 shadow plus
+  log-dirty mode use-after-free (XSA-427)
+  xsa427.patch
+- bsc#1209018 - VUL-0: CVE-2022-42333,CVE-2022-42334: xen: x86/HVM
+  pinned cache attributes mis-handling (XSA-428)
+  xsa428-1.patch
+  xsa428-2.patch
+- bsc#1209019 - VUL-0: CVE-2022-42331: xen: x86: speculative
+  vulnerability in 32bit SYSCALL path (XSA-429)
+  xsa429.patch
+
+-------------------------------------------------------------------

Old:
----
  altp2m-gcc13.patch
  bunzip-gcc13.patch

New:
----
  63a03b73-VMX-VMExit-based-BusLock-detection.patch
  63a03ba6-VMX-INTR_SHADOW_NMI-helper.patch
  63a03bce-VMX-Notify-VMExit.patch
  63e53ac9-x86-CPUID-leaves-7-1-ecx-edx.patch
  63e53ac9-x86-disable-CET-SS-when-fractured-updates.patch
  63f4d045-x86-ucode-AMD-apply-early-on-all-threads.patch
  63fe06e0-x86-ucode-AMD-apply-late-on-all-threads.patch
  640f3035-x86-altp2m-help-gcc13.patch
  641041e8-VT-d-constrain-IGD-check.patch
  64104238-bunzip-gcc13.patch
  6419697d-AMD-IOMMU-no-XT-x2APIC-phys.patch
  64199e0c-x86-shadow-account-for-log-dirty-mode.patch
  64199e0d-x86-HVM-bound-number-of-pca-regions.patch
  64199e0e-x86-HVM-serialize-pca-list-manipulation.patch
  64199e0f-x86-spec-ctrl-defer-CR4_PV32_RESTORE-for-CSTAR.patch
  libxl.fix-guest-kexec-skip-cpuid-policy.patch

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ xen.spec ++++++
--- /var/tmp/diff_new_pack.p2IWAX/_old  2023-04-01 23:26:58.367328836 +0200
+++ /var/tmp/diff_new_pack.p2IWAX/_new  2023-04-01 23:26:58.375328878 +0200
@@ -119,7 +119,7 @@
 %endif
 Provides:       installhint(reboot-needed)
 
-Version:        4.17.0_04
+Version:        4.17.0_06
 Release:        0
 Summary:        Xen Virtualization: Hypervisor (aka VMM aka Microkernel)
 License:        GPL-2.0-only
@@ -155,12 +155,27 @@
 # For xen-libs
 Source99:       baselibs.conf
 # Upstream patches
-Patch1:         63a03e28-x86-high-freq-TSC-overflow.patch
-Patch2:         63c05478-VMX-calculate-model-specific-LBRs-once.patch
-Patch3:         63c05478-VMX-support-CPUs-without-model-specific-LBR.patch
-Patch4:         63d24e91-tools-xenstore-revert-simplify-loop-handling.patch
-Patch5:         63e4da00-dont-log-errors-when-trying-to-load-PVH-xenstore-stubdom.patch
-Patch6:         63ebca9c-x86-spec-ctrl-Mitigate-Cross-Thread-Return-Address-Predictions.patch
+Patch1:         63a03b73-VMX-VMExit-based-BusLock-detection.patch
+Patch2:         63a03ba6-VMX-INTR_SHADOW_NMI-helper.patch
+Patch3:         63a03bce-VMX-Notify-VMExit.patch
+Patch4:         63a03e28-x86-high-freq-TSC-overflow.patch
+Patch5:         63c05478-VMX-calculate-model-specific-LBRs-once.patch
+Patch6:         63c05478-VMX-support-CPUs-without-model-specific-LBR.patch
+Patch7:         63d24e91-tools-xenstore-revert-simplify-loop-handling.patch
+Patch8:         63e4da00-dont-log-errors-when-trying-to-load-PVH-xenstore-stubdom.patch
+Patch9:         63e53ac9-x86-CPUID-leaves-7-1-ecx-edx.patch
+Patch10:        63e53ac9-x86-disable-CET-SS-when-fractured-updates.patch
+Patch11:        63ebca9c-x86-spec-ctrl-Mitigate-Cross-Thread-Return-Address-Predictions.patch
+Patch12:        63f4d045-x86-ucode-AMD-apply-early-on-all-threads.patch
+Patch13:        63fe06e0-x86-ucode-AMD-apply-late-on-all-threads.patch
+Patch14:        640f3035-x86-altp2m-help-gcc13.patch
+Patch15:        641041e8-VT-d-constrain-IGD-check.patch
+Patch16:        64104238-bunzip-gcc13.patch
+Patch17:        6419697d-AMD-IOMMU-no-XT-x2APIC-phys.patch
+Patch18:        64199e0c-x86-shadow-account-for-log-dirty-mode.patch
+Patch19:        64199e0d-x86-HVM-bound-number-of-pca-regions.patch
+Patch20:        64199e0e-x86-HVM-serialize-pca-list-manipulation.patch
+Patch21:        64199e0f-x86-spec-ctrl-defer-CR4_PV32_RESTORE-for-CSTAR.patch
 # EMBARGOED security fixes
 # libxc
 Patch301:       libxc-bitmap-long.patch
@@ -207,8 +222,6 @@
 # Needs to go upstream
 Patch420:       suspend_evtchn_lock.patch
 Patch421:       vif-route.patch
-Patch422:       bunzip-gcc13.patch
-Patch423:       altp2m-gcc13.patch
 # Other bug fixes or features
 Patch450:       xen.sysconfig-fillup.patch
 Patch451:       xenconsole-no-multiple-connections.patch
@@ -218,6 +231,7 @@
 Patch455:       pygrub-boot-legacy-sles.patch
 Patch456:       pygrub-handle-one-line-menu-entries.patch
 Patch457:       aarch64-rename-PSR_MODE_ELxx-to-match-linux-headers.patch
+Patch460:       libxl.fix-guest-kexec-skip-cpuid-policy.patch
 Patch461:       libxl.max_event_channels.patch
 Patch463:       libxl.add-option-to-disable-disk-cache-flushes-in-qdisk.patch
 Patch464:       libxl.pvscsi.patch

++++++ 63a03b73-VMX-VMExit-based-BusLock-detection.patch ++++++
# Commit f7d07619d2ae0382e2922e287fbfbb27722f3f0b
# Date 2022-12-19 11:22:43 +0100
# Author Roger Pau Monné <roger....@citrix.com>
# Committer Jan Beulich <jbeul...@suse.com>
x86/vmx: implement VMExit based guest Bus Lock detection

Add support for enabling guest Bus Lock Detection on Intel systems.
Such detection works by triggering a vmexit, which ought to be enough
of a pause to prevent a guest from abusing the Bus Lock.

Add an extra Xen perf counter to track the number of Bus Locks detected.
This is done because Bus Locks can also be reported by setting bit 26
in the exit reason field, so also account for those.

Note EXIT_REASON_BUS_LOCK VMExits will always have bit 26 set in
exit_reason, and hence the performance counter doesn't need to be
increased for EXIT_REASON_BUS_LOCK handling.

Suggested-by: Andrew Cooper <andrew.coop...@citrix.com>
Signed-off-by: Roger Pau Monné <roger....@citrix.com>
Reviewed-by: Kevin Tian <kevin.t...@intel.com>
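
For orientation, a condensed sketch of the exit reason layout the patch
relies on (the constants below appear in the hunks that follow; the helper
itself is made up for illustration):

    #define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000
    #define VMX_EXIT_REASONS_BUS_LOCK       (1u << 26)
    #define EXIT_REASON_BUS_LOCK            74

    /* Bits 15:0 hold the basic exit reason; the upper bits are modifier
     * flags such as bit 26 ("bus lock asserted"), which can accompany any
     * basic reason.  That is why the counter is bumped and bit 26 cleared
     * before dispatch, and why nvmx compares only the low 16 bits (the
     * (uint16_t) cast in the vvmx.c hunk). */
    static inline uint16_t basic_exit_reason(unsigned long exit_reason)
    {
        return (uint16_t)exit_reason;
    }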

--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -209,6 +209,7 @@ static void __init vmx_display_features(
     P(cpu_has_vmx_virt_exceptions, "Virtualisation Exceptions");
     P(cpu_has_vmx_pml, "Page Modification Logging");
     P(cpu_has_vmx_tsc_scaling, "TSC Scaling");
+    P(cpu_has_vmx_bus_lock_detection, "Bus Lock Detection");
 #undef P
 
     if ( !printed )
@@ -318,7 +319,8 @@ static int vmx_init_vmcs_config(bool bsp
                SECONDARY_EXEC_ENABLE_VM_FUNCTIONS |
                SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS |
                SECONDARY_EXEC_XSAVES |
-               SECONDARY_EXEC_TSC_SCALING);
+               SECONDARY_EXEC_TSC_SCALING |
+               SECONDARY_EXEC_BUS_LOCK_DETECTION);
         if ( _vmx_misc_cap & VMX_MISC_VMWRITE_ALL )
             opt |= SECONDARY_EXEC_ENABLE_VMCS_SHADOWING;
         if ( opt_vpid_enabled )
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -4081,6 +4081,12 @@ void vmx_vmexit_handler(struct cpu_user_
             return;
     }
 
+    if ( unlikely(exit_reason & VMX_EXIT_REASONS_BUS_LOCK) )
+    {
+        perfc_incr(buslock);
+        exit_reason &= ~VMX_EXIT_REASONS_BUS_LOCK;
+    }
+
     /* XXX: This looks ugly, but we need a mechanism to ensure
      * any pending vmresume has really happened
      */
@@ -4590,6 +4596,15 @@ void vmx_vmexit_handler(struct cpu_user_
         vmx_handle_descriptor_access(exit_reason);
         break;
 
+    case EXIT_REASON_BUS_LOCK:
+        /*
+         * Nothing to do: just taking a vmexit should be enough of a pause to
+         * prevent a VM from crippling the host with bus locks.  Note
+         * EXIT_REASON_BUS_LOCK will always have bit 26 set in exit_reason, and
+         * hence the perf counter is already increased.
+         */
+        break;
+
     case EXIT_REASON_VMX_PREEMPTION_TIMER_EXPIRED:
     case EXIT_REASON_INVPCID:
     /* fall through */
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -2405,7 +2405,7 @@ void nvmx_idtv_handling(void)
      * be reinjected, otherwise, pass to L1.
      */
     __vmread(VM_EXIT_REASON, &reason);
-    if ( reason != EXIT_REASON_EPT_VIOLATION ?
+    if ( (uint16_t)reason != EXIT_REASON_EPT_VIOLATION ?
          !(nvmx->intr.intr_info & INTR_INFO_VALID_MASK) :
          !nvcpu->nv_vmexit_pending )
     {
@@ -2486,6 +2486,7 @@ int nvmx_n2_vmexit_handler(struct cpu_us
     case EXIT_REASON_EPT_VIOLATION:
     case EXIT_REASON_EPT_MISCONFIG:
     case EXIT_REASON_EXTERNAL_INTERRUPT:
+    case EXIT_REASON_BUS_LOCK:
         /* pass to L0 handler */
         break;
     case VMX_EXIT_REASONS_FAILED_VMENTRY:
--- a/xen/arch/x86/include/asm/hvm/vmx/vmcs.h
+++ b/xen/arch/x86/include/asm/hvm/vmx/vmcs.h
@@ -267,6 +267,7 @@ extern u32 vmx_vmentry_control;
 #define SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS   0x00040000
 #define SECONDARY_EXEC_XSAVES                   0x00100000
 #define SECONDARY_EXEC_TSC_SCALING              0x02000000
+#define SECONDARY_EXEC_BUS_LOCK_DETECTION       0x40000000
 extern u32 vmx_secondary_exec_control;
 
 #define VMX_EPT_EXEC_ONLY_SUPPORTED                         0x00000001
@@ -346,6 +347,8 @@ extern u64 vmx_ept_vpid_cap;
     (vmx_secondary_exec_control & SECONDARY_EXEC_XSAVES)
 #define cpu_has_vmx_tsc_scaling \
     (vmx_secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
+#define cpu_has_vmx_bus_lock_detection \
+    (vmx_secondary_exec_control & SECONDARY_EXEC_BUS_LOCK_DETECTION)
 
 #define VMCS_RID_TYPE_MASK              0x80000000
 
--- a/xen/arch/x86/include/asm/hvm/vmx/vmx.h
+++ b/xen/arch/x86/include/asm/hvm/vmx/vmx.h
@@ -159,6 +159,7 @@ static inline void pi_clear_sn(struct pi
  * Exit Reasons
  */
 #define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000
+#define VMX_EXIT_REASONS_BUS_LOCK       (1u << 26)
 
 #define EXIT_REASON_EXCEPTION_NMI       0
 #define EXIT_REASON_EXTERNAL_INTERRUPT  1
@@ -219,6 +220,7 @@ static inline void pi_clear_sn(struct pi
 #define EXIT_REASON_PML_FULL            62
 #define EXIT_REASON_XSAVES              63
 #define EXIT_REASON_XRSTORS             64
+#define EXIT_REASON_BUS_LOCK            74
 /* Remember to also update VMX_PERF_EXIT_REASON_SIZE! */
 
 /*
--- a/xen/arch/x86/include/asm/perfc_defn.h
+++ b/xen/arch/x86/include/asm/perfc_defn.h
@@ -6,7 +6,7 @@ PERFCOUNTER_ARRAY(exceptions,
 
 #ifdef CONFIG_HVM
 
-#define VMX_PERF_EXIT_REASON_SIZE 65
+#define VMX_PERF_EXIT_REASON_SIZE 75
 #define VMEXIT_NPF_PERFC 143
 #define SVM_PERF_EXIT_REASON_SIZE (VMEXIT_NPF_PERFC + 1)
 PERFCOUNTER_ARRAY(vmexits,              "vmexits",
@@ -128,4 +128,6 @@ PERFCOUNTER(pauseloop_exits, "vmexits fr
 PERFCOUNTER(iommu_pt_shatters,    "IOMMU page table shatters")
 PERFCOUNTER(iommu_pt_coalesces,   "IOMMU page table coalesces")
 
+PERFCOUNTER(buslock, "Bus Locks Detected")
+
 /*#endif*/ /* __XEN_PERFC_DEFN_H__ */

++++++ 63a03ba6-VMX-INTR_SHADOW_NMI-helper.patch ++++++
# Commit d329b37d12132164c3894d0b6284be72576ef950
# Date 2022-12-19 11:23:34 +0100
# Author Roger Pau Monné <roger....@citrix.com>
# Committer Jan Beulich <jbeul...@suse.com>
x86/vmx: introduce helper to set VMX_INTR_SHADOW_NMI

Introduce a small helper to OR VMX_INTR_SHADOW_NMI in
GUEST_INTERRUPTIBILITY_INFO, in order to help deal with the
NMI-unblocked-by-IRET case.  Replace the existing usage in handling
EXIT_REASON_EXCEPTION_NMI and also add such handling to EPT violations
and page-modification log-full events.

Reported-by: Andrew Cooper <andrew.coop...@citrix.com>
Signed-off-by: Roger Pau Monné <roger....@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>
Reviewed-by: Kevin Tian <kevin.t...@intel.com>

--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -3964,6 +3964,15 @@ static int vmx_handle_apic_write(void)
     return vlapic_apicv_write(current, exit_qualification & 0xfff);
 }
 
+static void undo_nmis_unblocked_by_iret(void)
+{
+    unsigned long guest_info;
+
+    __vmread(GUEST_INTERRUPTIBILITY_INFO, &guest_info);
+    __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
+              guest_info | VMX_INTR_SHADOW_NMI);
+}
+
 void vmx_vmexit_handler(struct cpu_user_regs *regs)
 {
     unsigned long exit_qualification, exit_reason, idtv_info, intr_info = 0;
@@ -4164,13 +4173,7 @@ void vmx_vmexit_handler(struct cpu_user_
         if ( unlikely(intr_info & INTR_INFO_NMI_UNBLOCKED_BY_IRET) &&
              !(idtv_info & INTR_INFO_VALID_MASK) &&
              (vector != TRAP_double_fault) )
-        {
-            unsigned long guest_info;
-
-            __vmread(GUEST_INTERRUPTIBILITY_INFO, &guest_info);
-            __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
-                      guest_info | VMX_INTR_SHADOW_NMI);
-        }
+            undo_nmis_unblocked_by_iret();
 
         perfc_incra(cause_vector, vector);
 
@@ -4536,6 +4539,11 @@ void vmx_vmexit_handler(struct cpu_user_
 
         __vmread(GUEST_PHYSICAL_ADDRESS, &gpa);
         __vmread(EXIT_QUALIFICATION, &exit_qualification);
+
+        if ( unlikely(exit_qualification & INTR_INFO_NMI_UNBLOCKED_BY_IRET) &&
+             !(idtv_info & INTR_INFO_VALID_MASK) )
+            undo_nmis_unblocked_by_iret();
+
         ept_handle_violation(exit_qualification, gpa);
         break;
     }
@@ -4580,6 +4588,12 @@ void vmx_vmexit_handler(struct cpu_user_
         break;
 
     case EXIT_REASON_PML_FULL:
+        __vmread(EXIT_QUALIFICATION, &exit_qualification);
+
+        if ( unlikely(exit_qualification & INTR_INFO_NMI_UNBLOCKED_BY_IRET) &&
+             !(idtv_info & INTR_INFO_VALID_MASK) )
+            undo_nmis_unblocked_by_iret();
+
         vmx_vcpu_flush_pml_buffer(v);
         break;
 
--- a/xen/arch/x86/include/asm/hvm/vmx/vmx.h
+++ b/xen/arch/x86/include/asm/hvm/vmx/vmx.h
@@ -225,6 +225,9 @@ static inline void pi_clear_sn(struct pi
 
 /*
  * Interruption-information format
+ *
+ * Note INTR_INFO_NMI_UNBLOCKED_BY_IRET is also used with Exit Qualification
+ * field for EPT violations, PML full and SPP-related event vmexits.
  */
 #define INTR_INFO_VECTOR_MASK           0xff            /* 7:0 */
 #define INTR_INFO_INTR_TYPE_MASK        0x700           /* 10:8 */

++++++ 63a03bce-VMX-Notify-VMExit.patch ++++++
# Commit 573279cde1c4e752d4df34bc65ffafa17573148e
# Date 2022-12-19 11:24:14 +0100
# Author Roger Pau Monné <roger....@citrix.com>
# Committer Jan Beulich <jbeul...@suse.com>
x86/vmx: implement Notify VM Exit

Under certain conditions guests can get the CPU stuck in an unbounded
loop without the possibility of an interrupt window to occur on
instruction boundary.  This was the case with the scenarios described
in XSA-156.

Make use of the Notify VM Exit mechanism, which will trigger a VM Exit
if no interrupt window occurs for a specified amount of time.  Note
that using the Notify VM Exit avoids having to trap #AC and #DB
exceptions, as Xen is guaranteed to get a VM Exit even if the guest
puts the CPU in a loop without an interrupt window; as such, disable
the intercepts if the feature is available and enabled.

Setting the notify VM exit window to 0 is safe because there's a
threshold added by the hardware in order to have a sane window value.

Note the handling of EXIT_REASON_NOTIFY in the nested virtualization
case is passed to L0, and hence a nested guest being able to trigger a
notify VM exit with an invalid context would be able to crash the L1
hypervisor (by L0 destroying the domain).  Since we don't expose VM
Notify support to L1, it should already enable the required
protections in order to prevent VM Notify from triggering in the first
place.

Suggested-by: Andrew Cooper <andrew.coop...@citrix.com>
Signed-off-by: Roger Pau Monné <roger....@citrix.com>
Reviewed-by: Kevin Tian <kevin.t...@intel.com>

# Commit 5f08bc9404c7cfa8131e262c7dbcb4d96c752686
# Date 2023-01-20 19:39:32 +0000
# Author Andrew Cooper <andrew.coop...@citrix.com>
# Committer Andrew Cooper <andrew.coop...@citrix.com>
x86/vmx: Partially revert "x86/vmx: implement Notify VM Exit"

The original patch tried to do two things - implement VMNotify, and
re-optimise VT-x to not intercept #DB/#AC by default.

The second part is buggy in multiple ways.  Both GDBSX and Introspection need
to conditionally intercept #DB, which was not accounted for.  Also, #DB
interception has nothing at all to do with cpu_has_monitor_trap_flag.

Revert the second half, leaving #DB/#AC intercepted unilaterally, but with
VMNotify active by default when available.

Fixes: 573279cde1c4 ("x86/vmx: implement Notify VM Exit")
Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com>
Reviewed-by: Kevin Tian <kevin.t...@intel.com>
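
Based on the documentation hunk below, usage boils down to the following
(illustrative command line fragments, not part of the changeset):

    vm-notify-window=0     # default: active when hardware supports it; the
                           # hardware adds its own safety threshold to 0
    vm-notify-window=-1    # opt out of Notify VM Exit entirely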

--- a/docs/misc/xen-command-line.pandoc
+++ b/docs/misc/xen-command-line.pandoc
@@ -2624,6 +2624,17 @@ guest will notify Xen that it has failed
 <major>, <minor> and <build> must be integers. The values will be
 encoded in guest CPUID 0x40000002 if viridian enlightenments are enabled.
 
+### vm-notify-window (Intel)
+> `= <integer>`
+
+> Default: `0`
+
+Specify the value of the VM Notify window used to detect locked VMs. Set to -1
+to disable the feature.  Value is in units of crystal clock cycles.
+
+Note the hardware might add a threshold to the provided value in order to make
+it safe, and hence using 0 is fine.
+
 ### vpid (Intel)
 > `= <boolean>`
 
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -67,6 +67,9 @@ integer_param("ple_gap", ple_gap);
 static unsigned int __read_mostly ple_window = 4096;
 integer_param("ple_window", ple_window);
 
+static unsigned int __ro_after_init vm_notify_window;
+integer_param("vm-notify-window", vm_notify_window);
+
 static bool __read_mostly opt_ept_pml = true;
 static s8 __read_mostly opt_ept_ad = -1;
 int8_t __read_mostly opt_ept_exec_sp = -1;
@@ -210,6 +213,7 @@ static void __init vmx_display_features(
     P(cpu_has_vmx_pml, "Page Modification Logging");
     P(cpu_has_vmx_tsc_scaling, "TSC Scaling");
     P(cpu_has_vmx_bus_lock_detection, "Bus Lock Detection");
+    P(cpu_has_vmx_notify_vm_exiting, "Notify VM Exit");
 #undef P
 
     if ( !printed )
@@ -329,6 +333,8 @@ static int vmx_init_vmcs_config(bool bsp
             opt |= SECONDARY_EXEC_UNRESTRICTED_GUEST;
         if ( opt_ept_pml )
             opt |= SECONDARY_EXEC_ENABLE_PML;
+        if ( vm_notify_window != ~0u )
+            opt |= SECONDARY_EXEC_NOTIFY_VM_EXITING;
 
         /*
          * "APIC Register Virtualization" and "Virtual Interrupt Delivery"
@@ -1290,6 +1296,10 @@ static int construct_vmcs(struct vcpu *v
     v->arch.hvm.vmx.exception_bitmap = HVM_TRAP_MASK
               | (paging_mode_hap(d) ? 0 : (1U << TRAP_page_fault))
               | (v->arch.fully_eager_fpu ? 0 : (1U << TRAP_no_device));
+
+    if ( cpu_has_vmx_notify_vm_exiting )
+        __vmwrite(NOTIFY_WINDOW, vm_notify_window);
+
     vmx_update_exception_bitmap(v);
 
     v->arch.hvm.guest_cr[0] = X86_CR0_PE | X86_CR0_ET;
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -4619,6 +4619,22 @@ void vmx_vmexit_handler(struct cpu_user_
          */
         break;
 
+    case EXIT_REASON_NOTIFY:
+        __vmread(EXIT_QUALIFICATION, &exit_qualification);
+
+        if ( unlikely(exit_qualification & NOTIFY_VM_CONTEXT_INVALID) )
+        {
+            perfc_incr(vmnotify_crash);
+            gprintk(XENLOG_ERR, "invalid VM context after notify vmexit\n");
+            domain_crash(v->domain);
+            break;
+        }
+
+        if ( unlikely(exit_qualification & INTR_INFO_NMI_UNBLOCKED_BY_IRET) )
+            undo_nmis_unblocked_by_iret();
+
+        break;
+
     case EXIT_REASON_VMX_PREEMPTION_TIMER_EXPIRED:
     case EXIT_REASON_INVPCID:
     /* fall through */
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -2487,6 +2487,7 @@ int nvmx_n2_vmexit_handler(struct cpu_us
     case EXIT_REASON_EPT_MISCONFIG:
     case EXIT_REASON_EXTERNAL_INTERRUPT:
     case EXIT_REASON_BUS_LOCK:
+    case EXIT_REASON_NOTIFY:
         /* pass to L0 handler */
         break;
     case VMX_EXIT_REASONS_FAILED_VMENTRY:
--- a/xen/arch/x86/include/asm/hvm/vmx/vmcs.h
+++ b/xen/arch/x86/include/asm/hvm/vmx/vmcs.h
@@ -268,6 +268,7 @@ extern u32 vmx_vmentry_control;
 #define SECONDARY_EXEC_XSAVES                   0x00100000
 #define SECONDARY_EXEC_TSC_SCALING              0x02000000
 #define SECONDARY_EXEC_BUS_LOCK_DETECTION       0x40000000
+#define SECONDARY_EXEC_NOTIFY_VM_EXITING        0x80000000
 extern u32 vmx_secondary_exec_control;
 
 #define VMX_EPT_EXEC_ONLY_SUPPORTED                         0x00000001
@@ -349,6 +350,8 @@ extern u64 vmx_ept_vpid_cap;
     (vmx_secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
 #define cpu_has_vmx_bus_lock_detection \
     (vmx_secondary_exec_control & SECONDARY_EXEC_BUS_LOCK_DETECTION)
+#define cpu_has_vmx_notify_vm_exiting \
+    (vmx_secondary_exec_control & SECONDARY_EXEC_NOTIFY_VM_EXITING)
 
 #define VMCS_RID_TYPE_MASK              0x80000000
 
@@ -456,6 +459,7 @@ enum vmcs_field {
     SECONDARY_VM_EXEC_CONTROL       = 0x0000401e,
     PLE_GAP                         = 0x00004020,
     PLE_WINDOW                      = 0x00004022,
+    NOTIFY_WINDOW                   = 0x00004024,
     VM_INSTRUCTION_ERROR            = 0x00004400,
     VM_EXIT_REASON                  = 0x00004402,
     VM_EXIT_INTR_INFO               = 0x00004404,
--- a/xen/arch/x86/include/asm/hvm/vmx/vmx.h
+++ b/xen/arch/x86/include/asm/hvm/vmx/vmx.h
@@ -221,6 +221,7 @@ static inline void pi_clear_sn(struct pi
 #define EXIT_REASON_XSAVES              63
 #define EXIT_REASON_XRSTORS             64
 #define EXIT_REASON_BUS_LOCK            74
+#define EXIT_REASON_NOTIFY              75
 /* Remember to also update VMX_PERF_EXIT_REASON_SIZE! */
 
 /*
@@ -237,6 +238,11 @@ static inline void pi_clear_sn(struct pi
 #define INTR_INFO_RESVD_BITS_MASK       0x7ffff000
 
 /*
+ * Exit Qualifications for NOTIFY VM EXIT
+ */
+#define NOTIFY_VM_CONTEXT_INVALID       1u
+
+/*
  * Exit Qualifications for MOV for Control Register Access
  */
 enum {
--- a/xen/arch/x86/include/asm/perfc_defn.h
+++ b/xen/arch/x86/include/asm/perfc_defn.h
@@ -6,7 +6,7 @@ PERFCOUNTER_ARRAY(exceptions,
 
 #ifdef CONFIG_HVM
 
-#define VMX_PERF_EXIT_REASON_SIZE 75
+#define VMX_PERF_EXIT_REASON_SIZE 76
 #define VMEXIT_NPF_PERFC 143
 #define SVM_PERF_EXIT_REASON_SIZE (VMEXIT_NPF_PERFC + 1)
 PERFCOUNTER_ARRAY(vmexits,              "vmexits",
@@ -129,5 +129,6 @@ PERFCOUNTER(iommu_pt_shatters,    "IOMMU
 PERFCOUNTER(iommu_pt_coalesces,   "IOMMU page table coalesces")
 
 PERFCOUNTER(buslock, "Bus Locks Detected")
+PERFCOUNTER(vmnotify_crash, "domain crashes by Notify VM Exit")
 
 /*#endif*/ /* __XEN_PERFC_DEFN_H__ */

++++++ 63e53ac9-x86-CPUID-leaves-7-1-ecx-edx.patch ++++++
# Commit b4a23bf6293aadecfd03bf9e83974443e2eac9cb
# Date 2023-02-09 18:26:17 +0000
# Author Andrew Cooper <andrew.coop...@citrix.com>
# Committer Andrew Cooper <andrew.coop...@citrix.com>
x86/cpuid: Infrastructure for leaves 7:1{ecx,edx}

We don't actually need ecx yet, but adding it in now will reduce the extent to
which leaf 7 is out of order in a featureset.

Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>

--- a/tools/misc/xen-cpuid.c
+++ b/tools/misc/xen-cpuid.c
@@ -202,6 +202,14 @@ static const char *const str_7b1[32] =
     [ 0] = "ppin",
 };
 
+static const char *const str_7c1[32] =
+{
+};
+
+static const char *const str_7d1[32] =
+{
+};
+
 static const char *const str_7d2[32] =
 {
     [ 0] = "intel-psfd",
@@ -229,6 +237,8 @@ static const struct {
     { "0x80000021.eax",  "e21a", str_e21a },
     { "0x00000007:1.ebx", "7b1", str_7b1 },
     { "0x00000007:2.edx", "7d2", str_7d2 },
+    { "0x00000007:1.ecx", "7c1", str_7c1 },
+    { "0x00000007:1.edx", "7d1", str_7d1 },
 };
 
 #define COL_ALIGN "18"
--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -450,7 +450,8 @@ static void generic_identify(struct cpui
                        cpuid_count(7, 1,
                                    &c->x86_capability[FEATURESET_7a1],
                                    &c->x86_capability[FEATURESET_7b1],
-                                   &tmp, &tmp);
+                                   &c->x86_capability[FEATURESET_7c1],
+                                   &c->x86_capability[FEATURESET_7d1]);
                if (max_subleaf >= 2)
                        cpuid_count(7, 2,
                                    &tmp, &tmp, &tmp,
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -295,6 +295,10 @@ XEN_CPUFEATURE(RRSBA_CTRL,         13*32
 XEN_CPUFEATURE(BHI_CTRL,           13*32+ 4) /*   MSR_SPEC_CTRL.BHI_DIS_S */
 XEN_CPUFEATURE(MCDT_NO,            13*32+ 5) /*A  MCDT_NO */
 
+/* Intel-defined CPU features, CPUID level 0x00000007:1.ecx, word 14 */
+
+/* Intel-defined CPU features, CPUID level 0x00000007:1.edx, word 15 */
+
 #endif /* XEN_CPUFEATURE */
 
 /* Clean up from a default include.  Close the enum (for C). */
--- a/xen/include/xen/lib/x86/cpuid.h
+++ b/xen/include/xen/lib/x86/cpuid.h
@@ -18,6 +18,8 @@
 #define FEATURESET_e21a  11 /* 0x80000021.eax      */
 #define FEATURESET_7b1   12 /* 0x00000007:1.ebx    */
 #define FEATURESET_7d2   13 /* 0x00000007:2.edx    */
+#define FEATURESET_7c1   14 /* 0x00000007:1.ecx    */
+#define FEATURESET_7d1   15 /* 0x00000007:1.edx    */
 
 struct cpuid_leaf
 {
@@ -194,7 +196,14 @@ struct cpuid_policy
                 uint32_t _7b1;
                 struct { DECL_BITFIELD(7b1); };
             };
-            uint32_t /* c */:32, /* d */:32;
+            union {
+                uint32_t _7c1;
+                struct { DECL_BITFIELD(7c1); };
+            };
+            union {
+                uint32_t _7d1;
+                struct { DECL_BITFIELD(7d1); };
+            };
 
             /* Subleaf 2. */
             uint32_t /* a */:32, /* b */:32, /* c */:32;
@@ -343,6 +352,8 @@ static inline void cpuid_policy_to_featu
     fs[FEATURESET_e21a] = p->extd.e21a;
     fs[FEATURESET_7b1] = p->feat._7b1;
     fs[FEATURESET_7d2] = p->feat._7d2;
+    fs[FEATURESET_7c1] = p->feat._7c1;
+    fs[FEATURESET_7d1] = p->feat._7d1;
 }
 
 /* Fill in a CPUID policy from a featureset bitmap. */
@@ -363,6 +374,8 @@ static inline void cpuid_featureset_to_p
     p->extd.e21a  = fs[FEATURESET_e21a];
     p->feat._7b1  = fs[FEATURESET_7b1];
     p->feat._7d2  = fs[FEATURESET_7d2];
+    p->feat._7c1  = fs[FEATURESET_7c1];
+    p->feat._7d1  = fs[FEATURESET_7d1];
 }
 
 static inline uint64_t cpuid_policy_xcr0_max(const struct cpuid_policy *p)

++++++ 63e53ac9-x86-disable-CET-SS-when-fractured-updates.patch ++++++
# Commit 01e7477d1b081cff4288ff9f51ec59ee94c03ee0
# Date 2023-02-09 18:26:17 +0000
# Author Andrew Cooper <andrew.coop...@citrix.com>
# Committer Andrew Cooper <andrew.coop...@citrix.com>
x86/shskt: Disable CET-SS on parts susceptible to fractured updates

Refer to Intel SDM Rev 70 (Dec 2022), Vol3 17.2.3 "Supervisor Shadow Stack
Token".

Architecturally, an event delivery which starts in CPL<3 and switches shadow
stack will first validate the Supervisor Shadow Stack Token (setting the busy
bit), then pushes CS/LIP/SSP.  One example of this is an NMI interrupting Xen.

Some CPUs suffer from an issue called fracturing, whereby a fault/vmexit/etc
between setting the busy bit and completing the event injection renders the
action non-restartable, because when it comes time to restart, the busy bit is
found to be already set.

This is far more easily encountered under virt, yet it is not the fault of the
hypervisor, nor the fault of the guest kernel.  The fault lies somewhere
between the architectural specification, and the uarch behaviour.

Intel have allocated CPUID.7[1].edx[18] CET_SSS to enumerate that supervisor
shadow stacks are safe to use.  Because of how Xen lays out its shadow stacks,
fracturing is not expected to be a problem on native.

Detect this case on boot and default to not using shstk if virtualised.
Specifying `cet=shstk` on the command line will override this heuristic and
enable shadow stacks regardless.

Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>
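
As a quick way to check for the enumeration described above, a minimal
user-space probe (illustrative only, not part of the patch; relies on
GCC/Clang's <cpuid.h>):

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        /* __get_cpuid_count() fails when leaf 7 is beyond the max leaf;
           an unimplemented subleaf simply reads as all zeroes. */
        if ( !__get_cpuid_count(7, 1, &eax, &ebx, &ecx, &edx) )
        {
            puts("CPUID leaf 7 not available");
            return 1;
        }

        printf("CET_SSS (7:1 edx[18]): %s\n",
               (edx & (1u << 18)) ? "yes" : "no");
        return 0;
    }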

--- a/docs/misc/xen-command-line.pandoc
+++ b/docs/misc/xen-command-line.pandoc
@@ -287,10 +287,15 @@ can be maintained with the pv-shim mecha
     protection.
 
     The option is available when `CONFIG_XEN_SHSTK` is compiled in, and
-    defaults to `true` on hardware supporting CET-SS.  Specifying
+    generally defaults to `true` on hardware supporting CET-SS.  Specifying
     `cet=no-shstk` will cause Xen not to use Shadow Stacks even when support
     is available in hardware.
 
+    Some hardware suffers from an issue known as Supervisor Shadow Stack
+    Fracturing.  On such hardware, Xen will default to not using Shadow Stacks
+    when virtualised.  Specifying `cet=shstk` will override this heuristic and
+    enable Shadow Stacks unilaterally.
+
 *   The `ibt=` boolean controls whether Xen uses Indirect Branch Tracking for
     its own protection.
 
--- a/tools/libs/light/libxl_cpuid.c
+++ b/tools/libs/light/libxl_cpuid.c
@@ -235,6 +235,8 @@ int libxl_cpuid_parse_config(libxl_cpuid
         {"fsrs",         0x00000007,  1, CPUID_REG_EAX, 11,  1},
         {"fsrcs",        0x00000007,  1, CPUID_REG_EAX, 12,  1},
 
+        {"cet-sss",      0x00000007,  1, CPUID_REG_EDX, 18,  1},
+
         {"intel-psfd",   0x00000007,  2, CPUID_REG_EDX,  0,  1},
         {"mcdt-no",      0x00000007,  2, CPUID_REG_EDX,  5,  1},
 
--- a/tools/misc/xen-cpuid.c
+++ b/tools/misc/xen-cpuid.c
@@ -208,6 +208,7 @@ static const char *const str_7c1[32] =
 
 static const char *const str_7d1[32] =
 {
+    [18] = "cet-sss",
 };
 
 static const char *const str_7d2[32] =
--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -346,11 +346,18 @@ void __init early_cpu_init(void)
               x86_cpuid_vendor_to_str(c->x86_vendor), c->x86, c->x86,
               c->x86_model, c->x86_model, c->x86_mask, eax);
 
-       if (c->cpuid_level >= 7)
-               cpuid_count(7, 0, &eax, &ebx,
+       if (c->cpuid_level >= 7) {
+               uint32_t max_subleaf;
+
+               cpuid_count(7, 0, &max_subleaf, &ebx,
                            &c->x86_capability[FEATURESET_7c0],
                            &c->x86_capability[FEATURESET_7d0]);
 
+               if (max_subleaf >= 1)
+                       cpuid_count(7, 1, &eax, &ebx, &ecx,
+                                   &c->x86_capability[FEATURESET_7d1]);
+       }
+
        eax = cpuid_eax(0x80000000);
        if ((eax >> 16) == 0x8000 && eax >= 0x80000008) {
                ebx = eax >= 0x8000001f ? cpuid_ebx(0x8000001f) : 0;
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -95,11 +95,7 @@ unsigned long __initdata highmem_start;
 size_param("highmem-start", highmem_start);
 #endif
 
-#ifdef CONFIG_XEN_SHSTK
-static bool __initdata opt_xen_shstk = true;
-#else
-#define opt_xen_shstk false
-#endif
+static int8_t __initdata opt_xen_shstk = -IS_ENABLED(CONFIG_XEN_SHSTK);
 
 #ifdef CONFIG_XEN_IBT
 static bool __initdata opt_xen_ibt = true;
@@ -1104,11 +1100,45 @@ void __init noreturn __start_xen(unsigne
     early_cpu_init();
 
     /* Choose shadow stack early, to set infrastructure up appropriately. */
-    if ( opt_xen_shstk && boot_cpu_has(X86_FEATURE_CET_SS) )
+    if ( !boot_cpu_has(X86_FEATURE_CET_SS) )
+        opt_xen_shstk = 0;
+
+    if ( opt_xen_shstk )
     {
-        printk("Enabling Supervisor Shadow Stacks\n");
+        /*
+         * Some CPUs suffer from Shadow Stack Fracturing, an issue whereby a
+         * fault/VMExit/etc between setting a Supervisor Busy bit and the
+         * event delivery completing renders the operation non-restartable.
+         * On restart, event delivery will find the Busy bit already set.
+         *
+         * This is a problem on bare metal, but outside of synthetic cases or
+         * a very badly timed #MC, it's not believed to be a problem.  It is a
+         * much bigger problem under virt, because we can VMExit for a number
+         * of legitimate reasons and tickle this bug.
+         *
+         * CPUs with this addressed enumerate CET-SSS to indicate that
+         * supervisor shadow stacks are now safe to use.
+         */
+        bool cpu_has_bug_shstk_fracture =
+            boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+            !boot_cpu_has(X86_FEATURE_CET_SSS);
 
-        setup_force_cpu_cap(X86_FEATURE_XEN_SHSTK);
+        /*
+         * On bare metal, assume that Xen won't be impacted by shstk
+         * fracturing problems.  Under virt, be more conservative and disable
+         * shstk by default.
+         */
+        if ( opt_xen_shstk == -1 )
+            opt_xen_shstk =
+                cpu_has_hypervisor ? !cpu_has_bug_shstk_fracture
+                                   : true;
+
+        if ( opt_xen_shstk )
+        {
+            printk("Enabling Supervisor Shadow Stacks\n");
+
+            setup_force_cpu_cap(X86_FEATURE_XEN_SHSTK);
+        }
     }
 
     if ( opt_xen_ibt && boot_cpu_has(X86_FEATURE_CET_IBT) )
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -298,6 +298,7 @@ XEN_CPUFEATURE(MCDT_NO,            13*32
 /* Intel-defined CPU features, CPUID level 0x00000007:1.ecx, word 14 */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:1.edx, word 15 */
+XEN_CPUFEATURE(CET_SSS,            15*32+18) /*   CET Supervisor Shadow Stacks safe to use */
 
 #endif /* XEN_CPUFEATURE */
 

++++++ 63ebca9c-x86-spec-ctrl-Mitigate-Cross-Thread-Return-Address-Predictions.patch ++++++
--- /var/tmp/diff_new_pack.p2IWAX/_old  2023-04-01 23:26:58.475329404 +0200
+++ /var/tmp/diff_new_pack.p2IWAX/_new  2023-04-01 23:26:58.479329425 +0200
@@ -9,11 +9,9 @@
 Reviewed-by: Jan Beulich <jbeul...@suse.com>
 (cherry picked from commit 63305e5392ec2d17b85e7996a97462744425db80)
 
-diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
-index 424b12cfb2..e7fe8b0cc9 100644
 --- a/docs/misc/xen-command-line.pandoc
 +++ b/docs/misc/xen-command-line.pandoc
-@@ -2343,7 +2343,7 @@ guests to use.
+@@ -2348,7 +2348,7 @@ guests to use.
    on entry and exit.  These blocks are necessary to virtualise support for
    guests and if disabled, guests will be unable to use IBRS/STIBP/SSBD/etc.
  * `rsb=` offers control over whether to overwrite the Return Stack Buffer /
@@ -22,11 +20,9 @@
  * `md-clear=` offers control over whether to use VERW to flush
    microarchitectural buffers on idle and exit from Xen.  *Note: For
    compatibility with development versions of this fix, `mds=` is also accepted
-diff --git a/xen/arch/x86/include/asm/cpufeatures.h b/xen/arch/x86/include/asm/cpufeatures.h
-index 865f110986..da0593de85 100644
 --- a/xen/arch/x86/include/asm/cpufeatures.h
 +++ b/xen/arch/x86/include/asm/cpufeatures.h
-@@ -35,7 +35,8 @@ XEN_CPUFEATURE(SC_RSB_HVM,        X86_SYNTH(19)) /* RSB overwrite needed for HVM
+@@ -35,7 +35,8 @@ XEN_CPUFEATURE(SC_RSB_HVM,        X86_SY
 XEN_CPUFEATURE(XEN_SELFSNOOP,     X86_SYNTH(20)) /* SELFSNOOP gets used by Xen itself */
 XEN_CPUFEATURE(SC_MSR_IDLE,       X86_SYNTH(21)) /* Clear MSR_SPEC_CTRL on idle */
 XEN_CPUFEATURE(XEN_LBR,           X86_SYNTH(22)) /* Xen uses MSR_DEBUGCTL.LBR */
@@ -36,11 +32,9 @@
 XEN_CPUFEATURE(SC_VERW_IDLE,      X86_SYNTH(25)) /* VERW used by Xen for idle */
 XEN_CPUFEATURE(XEN_SHSTK,         X86_SYNTH(26)) /* Xen uses CET Shadow Stacks */
 XEN_CPUFEATURE(XEN_IBT,           X86_SYNTH(27)) /* Xen uses CET Indirect Branch Tracking */
-diff --git a/xen/arch/x86/include/asm/spec_ctrl.h b/xen/arch/x86/include/asm/spec_ctrl.h
-index 6a77c39378..391973ef6a 100644
 --- a/xen/arch/x86/include/asm/spec_ctrl.h
 +++ b/xen/arch/x86/include/asm/spec_ctrl.h
-@@ -159,6 +159,21 @@ static always_inline void spec_ctrl_enter_idle(struct cpu_info *info)
+@@ -159,6 +159,21 @@ static always_inline void spec_ctrl_ente
       */
      alternative_input("", "verw %[sel]", X86_FEATURE_SC_VERW_IDLE,
                        [sel] "m" (info->verw_sel));
@@ -62,11 +56,9 @@
  }
  
  /* WARNING! `ret`, `call *`, `jmp *` not safe before this call. */
-diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
-index a320b81947..e80e2a5ed1 100644
 --- a/xen/arch/x86/spec_ctrl.c
 +++ b/xen/arch/x86/spec_ctrl.c
-@@ -1327,13 +1327,38 @@ void __init init_speculation_mitigations(void)
+@@ -1327,13 +1327,38 @@ void __init init_speculation_mitigations
       * 3) Some CPUs have RSBs which are not full width, which allow the
       *    attacker's entries to alias Xen addresses.
       *

++++++ 63f4d045-x86-ucode-AMD-apply-early-on-all-threads.patch ++++++
# Commit f4ef8a41b80831db2136bdaff9f946a1a4b051e7
# Date 2023-02-21 15:08:05 +0100
# Author Sergey Dyasli <sergey.dya...@citrix.com>
# Committer Jan Beulich <jbeul...@suse.com>
x86/ucode/AMD: apply the patch early on every logical thread

The original issue has been reported on AMD Bulldozer-based CPUs where
ucode loading loses the LWP feature bit in order to gain the IBPB bit.
LWP disabling is per-SMT/CMT core modification and needs to happen on
each sibling thread despite the shared microcode engine. Otherwise,
logical CPUs will end up with different cpuid capabilities.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216211

Guests running under Xen happen not to be affected because of the
levelling logic for the feature masking/override MSRs, which causes the
LWP bit to fall out and hides the issue. The latest recommendation from
AMD, after discussing this bug, is to load ucode on every logical CPU.

In Linux kernel this issue has been addressed by e7ad18d1169c
("x86/microcode/AMD: Apply the patch early on every logical thread").
Follow the same approach in Xen.

Introduce SAME_UCODE match result and use it for early AMD ucode
loading. Take this opportunity to move opt_ucode_allow_same out of
compare_revisions() to the relevant callers and also modify the warning
message based on it. Intel's side of things is modified for consistency
but provides no functional change.

Signed-off-by: Sergey Dyasli <sergey.dya...@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>
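
The resulting decision logic, condensed (an illustrative sketch, not the
actual Xen code; want_apply() and its parameters are made up, and
'allow_same' stands for opt_ucode_allow_same):

    enum microcode_match_result { OLD_UCODE, SAME_UCODE, NEW_UCODE, MIS_UCODE };

    static bool want_apply(enum microcode_match_result r,
                           bool amd_apply_path, bool allow_same)
    {
        if ( r == NEW_UCODE )
            return true;

        /* SAME_UCODE: AMD's apply_microcode() accepts it unconditionally,
         * so every SMT sibling can pick up per-thread changes; the generic
         * cache check and Intel's path require the admin opt-in. */
        return r == SAME_UCODE && (amd_apply_path || allow_same);
    }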

--- a/xen/arch/x86/cpu/microcode/amd.c
+++ b/xen/arch/x86/cpu/microcode/amd.c
@@ -176,8 +176,8 @@ static enum microcode_match_result compa
     if ( new_rev > old_rev )
         return NEW_UCODE;
 
-    if ( opt_ucode_allow_same && new_rev == old_rev )
-        return NEW_UCODE;
+    if ( new_rev == old_rev )
+        return SAME_UCODE;
 
     return OLD_UCODE;
 }
@@ -220,8 +220,13 @@ static int cf_check apply_microcode(cons
     unsigned int cpu = smp_processor_id();
     struct cpu_signature *sig = &per_cpu(cpu_sig, cpu);
     uint32_t rev, old_rev = sig->rev;
+    enum microcode_match_result result = microcode_fits(patch);
 
-    if ( microcode_fits(patch) != NEW_UCODE )
+    /*
+     * Allow application of the same revision to pick up SMT-specific changes
+     * even if the revision of the other SMT thread is already up-to-date.
+     */
+    if ( result != NEW_UCODE && result != SAME_UCODE )
         return -EINVAL;
 
     if ( check_final_patch_levels(sig) )
--- a/xen/arch/x86/cpu/microcode/core.c
+++ b/xen/arch/x86/cpu/microcode/core.c
@@ -610,17 +610,25 @@ static long cf_check microcode_update_he
      * that ucode revision.
      */
     spin_lock(&microcode_mutex);
-    if ( microcode_cache &&
-         alternative_call(ucode_ops.compare_patch,
-                          patch, microcode_cache) != NEW_UCODE )
+    if ( microcode_cache )
     {
-        spin_unlock(&microcode_mutex);
-        printk(XENLOG_WARNING "microcode: couldn't find any newer revision "
-                              "in the provided blob!\n");
-        microcode_free_patch(patch);
-        ret = -ENOENT;
+        enum microcode_match_result result;
 
-        goto put;
+        result = alternative_call(ucode_ops.compare_patch, patch,
+                                  microcode_cache);
+
+        if ( result != NEW_UCODE &&
+             !(opt_ucode_allow_same && result == SAME_UCODE) )
+        {
+            spin_unlock(&microcode_mutex);
+            printk(XENLOG_WARNING
+                   "microcode: couldn't find any newer%s revision in the 
provided blob!\n",
+                   opt_ucode_allow_same ? " (or the same)" : "");
+            microcode_free_patch(patch);
+            ret = -ENOENT;
+
+            goto put;
+        }
     }
     spin_unlock(&microcode_mutex);
 
--- a/xen/arch/x86/cpu/microcode/intel.c
+++ b/xen/arch/x86/cpu/microcode/intel.c
@@ -232,8 +232,8 @@ static enum microcode_match_result compa
     if ( new_rev > old_rev )
         return NEW_UCODE;
 
-    if ( opt_ucode_allow_same && new_rev == old_rev )
-        return NEW_UCODE;
+    if ( new_rev == old_rev )
+        return SAME_UCODE;
 
     /*
      * Treat pre-production as always applicable - anyone using pre-production
@@ -290,8 +290,12 @@ static int cf_check apply_microcode(cons
     unsigned int cpu = smp_processor_id();
     struct cpu_signature *sig = &this_cpu(cpu_sig);
     uint32_t rev, old_rev = sig->rev;
+    enum microcode_match_result result;
+
+    result = microcode_update_match(patch);
 
-    if ( microcode_update_match(patch) != NEW_UCODE )
+    if ( result != NEW_UCODE &&
+         !(opt_ucode_allow_same && result == SAME_UCODE) )
         return -EINVAL;
 
     wbinvd();
--- a/xen/arch/x86/cpu/microcode/private.h
+++ b/xen/arch/x86/cpu/microcode/private.h
@@ -6,7 +6,8 @@
 extern bool opt_ucode_allow_same;
 
 enum microcode_match_result {
-    OLD_UCODE, /* signature matched, but revision id is older or equal */
+    OLD_UCODE, /* signature matched, but revision id is older */
+    SAME_UCODE, /* signature matched, but revision id is the same */
     NEW_UCODE, /* signature matched, but revision id is newer */
     MIS_UCODE, /* signature mismatched */
 };

++++++ 63fe06e0-x86-ucode-AMD-apply-late-on-all-threads.patch ++++++
# Commit f1315e48a03a42f78f9b03c0a384165baf02acae
# Date 2023-02-28 14:51:28 +0100
# Author Sergey Dyasli <sergey.dya...@citrix.com>
# Committer Jan Beulich <jbeul...@suse.com>
x86/ucode/AMD: late load the patch on every logical thread

Currently late ucode loading is performed only on the first core of CPU
siblings.  But according to the latest recommendation from AMD, late
ucode loading should happen on every logical thread/core on AMD CPUs.

To achieve that, introduce is_cpu_primary() helper which will consider
every logical cpu as "primary" when running on AMD CPUs.  Also include
Hygon in the check for future-proofing.

Signed-off-by: Sergey Dyasli <sergey.dya...@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>

--- a/xen/arch/x86/cpu/microcode/core.c
+++ b/xen/arch/x86/cpu/microcode/core.c
@@ -274,6 +274,20 @@ static bool microcode_update_cache(struc
     return true;
 }
 
+/* Returns true if ucode should be loaded on a given cpu */
+static bool is_cpu_primary(unsigned int cpu)
+{
+    if ( boot_cpu_data.x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON) )
+        /* Load ucode on every logical thread/core */
+        return true;
+
+    /* Intel CPUs should load ucode only on the first core of SMT siblings */
+    if ( cpu == cpumask_first(per_cpu(cpu_sibling_mask, cpu)) )
+        return true;
+
+    return false;
+}
+
 /* Wait for a condition to be met with a timeout (us). */
 static int wait_for_condition(bool (*func)(unsigned int data),
                               unsigned int data, unsigned int timeout)
@@ -380,7 +394,7 @@ static int primary_thread_work(const str
 static int cf_check microcode_nmi_callback(
     const struct cpu_user_regs *regs, int cpu)
 {
-    unsigned int primary = cpumask_first(this_cpu(cpu_sibling_mask));
+    bool primary_cpu = is_cpu_primary(cpu);
     int ret;
 
     /* System-generated NMI, leave to main handler */
@@ -393,10 +407,10 @@ static int cf_check microcode_nmi_callba
      * ucode_in_nmi.
      */
     if ( cpu == cpumask_first(&cpu_online_map) ||
-         (!ucode_in_nmi && cpu == primary) )
+         (!ucode_in_nmi && primary_cpu) )
         return 0;
 
-    if ( cpu == primary )
+    if ( primary_cpu )
         ret = primary_thread_work(nmi_patch);
     else
         ret = secondary_nmi_work();
@@ -547,7 +561,7 @@ static int cf_check do_microcode_update(
      */
     if ( cpu == cpumask_first(&cpu_online_map) )
         ret = control_thread_fn(patch);
-    else if ( cpu == cpumask_first(this_cpu(cpu_sibling_mask)) )
+    else if ( is_cpu_primary(cpu) )
         ret = primary_thread_fn(patch);
     else
         ret = secondary_thread_fn();
@@ -640,7 +654,7 @@ static long cf_check microcode_update_he
     /* Calculate the number of online CPU core */
     nr_cores = 0;
     for_each_online_cpu(cpu)
-        if ( cpu == cpumask_first(per_cpu(cpu_sibling_mask, cpu)) )
+        if ( is_cpu_primary(cpu) )
             nr_cores++;
 
     printk(XENLOG_INFO "%u cores are to update their microcode\n", nr_cores);

++++++ 640f3035-x86-altp2m-help-gcc13.patch ++++++
# Commit be62b1fc2aa7375d553603fca07299da765a89fe
# Date 2023-03-13 15:16:21 +0100
# Author Jan Beulich <jbeul...@suse.com>
# Committer Jan Beulich <jbeul...@suse.com>
x86/altp2m: help gcc13 to avoid it emitting a warning

Switches of altp2m-s always expect a valid altp2m to be in place (and
indeed altp2m_vcpu_initialise() sets the active one to be at index 0).
The compiler, however, cannot know that, and hence it cannot eliminate
p2m_get_altp2m()'s case of returning (literal) NULL. If the compiler then
decides to special-case that code path in the caller, the dereference in
instances of

    atomic_dec(&p2m_get_altp2m(v)->active_vcpus);

can, to the code generator, appear to be NULL dereferences, leading to

In function 'atomic_dec',
    inlined from '...' at ...:
./arch/x86/include/asm/atomic.h:182:5: error: array subscript 0 is outside array bounds of 'int[0]' [-Werror=array-bounds=]

Aid the compiler by adding a BUG_ON() checking the return value of the
problematic p2m_get_altp2m(). Since, with the use of a local variable,
each remaining p2m_get_altp2m() call would look questionable at first
glance (why is the local variable not used here?), open-code the only
relevant piece of p2m_get_altp2m() there.

To avoid repeatedly doing these transformations, and also to limit how
"bad" the open-coding really is, convert the entire operation to an
inline helper, used by all three instances (and accepting the redundant
BUG_ON(idx >= MAX_ALTP2M) in two of the three cases).

Reported-by: Charles Arnold <carn...@suse.com>
Signed-off-by: Jan Beulich <jbeul...@suse.com>
Acked-by: Andrew Cooper <andrew.coop...@citrix.com>
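
A stand-alone shape of the warning, reduced to a few lines (illustrative;
all names here are made up, and the exact diagnostic depends on the gcc 13
snapshot used):

    struct p2m { int active_vcpus; };

    /* Can return NULL for an out-of-range index -- visible to the
     * compiler, while the invariant "index 0 is always valid" is not. */
    struct p2m *get_altp2m(unsigned int idx);

    void switch_altp2m(unsigned int old_idx, unsigned int new_idx)
    {
        /* gcc 13 at -O2 may clone the NULL path of get_altp2m() and then
         * flag the dereference, much like the atomic_dec() case above. */
        __atomic_fetch_sub(&get_altp2m(old_idx)->active_vcpus, 1,
                           __ATOMIC_SEQ_CST);
        __atomic_fetch_add(&get_altp2m(new_idx)->active_vcpus, 1,
                           __ATOMIC_SEQ_CST);
    }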

--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -4072,13 +4072,7 @@ void vmx_vmexit_handler(struct cpu_user_
             }
         }
 
-        if ( idx != vcpu_altp2m(v).p2midx )
-        {
-            BUG_ON(idx >= MAX_ALTP2M);
-            atomic_dec(&p2m_get_altp2m(v)->active_vcpus);
-            vcpu_altp2m(v).p2midx = idx;
-            atomic_inc(&p2m_get_altp2m(v)->active_vcpus);
-        }
+        p2m_set_altp2m(v, idx);
     }
 
     if ( unlikely(currd->arch.monitor.vmexit_enabled) )
--- a/xen/arch/x86/include/asm/p2m.h
+++ b/xen/arch/x86/include/asm/p2m.h
@@ -879,6 +879,26 @@ static inline struct p2m_domain *p2m_get
     return v->domain->arch.altp2m_p2m[index];
 }
 
+/* set current alternate p2m table */
+static inline bool p2m_set_altp2m(struct vcpu *v, unsigned int idx)
+{
+    struct p2m_domain *orig;
+
+    BUG_ON(idx >= MAX_ALTP2M);
+
+    if ( idx == vcpu_altp2m(v).p2midx )
+        return false;
+
+    orig = p2m_get_altp2m(v);
+    BUG_ON(!orig);
+    atomic_dec(&orig->active_vcpus);
+
+    vcpu_altp2m(v).p2midx = idx;
+    atomic_inc(&v->domain->arch.altp2m_p2m[idx]->active_vcpus);
+
+    return true;
+}
+
 /* Switch alternate p2m for a single vcpu */
 bool_t p2m_switch_vcpu_altp2m_by_id(struct vcpu *v, unsigned int idx);
 
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -1787,13 +1787,8 @@ bool_t p2m_switch_vcpu_altp2m_by_id(stru
 
     if ( d->arch.altp2m_eptp[idx] != mfn_x(INVALID_MFN) )
     {
-        if ( idx != vcpu_altp2m(v).p2midx )
-        {
-            atomic_dec(&p2m_get_altp2m(v)->active_vcpus);
-            vcpu_altp2m(v).p2midx = idx;
-            atomic_inc(&p2m_get_altp2m(v)->active_vcpus);
+        if ( p2m_set_altp2m(v, idx) )
             altp2m_vcpu_update_p2m(v);
-        }
         rc = 1;
     }
 
@@ -2070,13 +2065,8 @@ int p2m_switch_domain_altp2m_by_id(struc
     if ( d->arch.altp2m_visible_eptp[idx] != mfn_x(INVALID_MFN) )
     {
         for_each_vcpu( d, v )
-            if ( idx != vcpu_altp2m(v).p2midx )
-            {
-                atomic_dec(&p2m_get_altp2m(v)->active_vcpus);
-                vcpu_altp2m(v).p2midx = idx;
-                atomic_inc(&p2m_get_altp2m(v)->active_vcpus);
+            if ( p2m_set_altp2m(v, idx) )
                 altp2m_vcpu_update_p2m(v);
-            }
 
         rc = 0;
     }

++++++ 641041e8-VT-d-constrain-IGD-check.patch ++++++
# Commit f8c4317295fa1cde1a81779b7e362651c084efb8
# Date 2023-03-14 10:44:08 +0100
# Author Jan Beulich <jbeul...@suse.com>
# Committer Jan Beulich <jbeul...@suse.com>
VT-d: constrain IGD check

Marking a DRHD as controlling an IGD isn't very sensible without
checking that at the very least it's a graphics device that lives at
0000:00:02.0. Re-use the reading of the class-code to control both the
clearing of "gfx_only" and the setting of "igd_drhd_address".

Signed-off-by: Jan Beulich <jbeul...@suse.com>
Reviewed-by: Kevin Tian <kevin.t...@intel.com>

--- a/xen/drivers/passthrough/vtd/dmar.c
+++ b/xen/drivers/passthrough/vtd/dmar.c
@@ -391,15 +391,12 @@ static int __init acpi_parse_dev_scope(
 
             if ( drhd )
             {
-                if ( (seg == 0) && (bus == 0) && (path->dev == 2) &&
-                     (path->fn == 0) )
-                    igd_drhd_address = drhd->address;
-
-                if ( gfx_only &&
-                     pci_conf_read8(PCI_SBDF(seg, bus, path->dev, path->fn),
+                if ( pci_conf_read8(PCI_SBDF(seg, bus, path->dev, path->fn),
                                     PCI_CLASS_DEVICE + 1) != 0x03
                                     /* PCI_BASE_CLASS_DISPLAY */ )
                     gfx_only = false;
+                else if ( !seg && !bus && path->dev == 2 && !path->fn )
+                    igd_drhd_address = drhd->address;
             }
 
             break;

++++++ 64104238-bunzip-gcc13.patch ++++++
# Commit 402195e56de0aacf97e05c80ed367d464ca6938b
# Date 2023-03-14 10:45:28 +0100
# Author Jan Beulich <jbeul...@suse.com>
# Committer Jan Beulich <jbeul...@suse.com>
bunzip: work around gcc13 warning

While provable that length[0] is always initialized (because symCount
cannot be zero), upcoming gcc13 fails to recognize this and warns about
the unconditional use of the value immediately following the loop.

See also https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106511.

Reported-by: Martin Liška <martin.li...@suse.com>
Signed-off-by: Jan Beulich <jbeul...@suse.com>
Acked-by: Andrew Cooper <andrew.coop...@citrix.com>
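
The shape of the false positive, reduced (illustrative; min_len() is a
made-up stand-in for the Huffman length loop in get_next_block()):

    /* With gcc 13 at -O2, -Wmaybe-uninitialized warns that 'first' may be
       used uninitialized: the compiler can't prove n > 0, even though all
       callers guarantee it -- just as symCount is always positive in
       bunzip2. */
    unsigned int min_len(const unsigned char *len, unsigned int n)
    {
        unsigned int i, first;

        for ( i = 0; i < n; i++ )
            if ( i == 0 || len[i] < first )
                first = len[i];

        return first;
    }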

--- a/xen/common/bunzip2.c
+++ b/xen/common/bunzip2.c
@@ -233,6 +233,11 @@ static int __init get_next_block(struct
                   becomes negative, so an unsigned inequality catches
                   it.) */
                t = get_bits(bd, 5)-1;
+               /* GCC 13 has apparently improved use-before-set detection, but
+                  it can't figure out that length[0] is always initialized by
+                  virtue of symCount always being positive when making it here.
+                  See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106511. */
+               length[0] = 0;
                for (i = 0; i < symCount; i++) {
                        for (;;) {
                                if (((unsigned)t) > (MAX_HUFCODE_BITS-1))

++++++ 6419697d-AMD-IOMMU-no-XT-x2APIC-phys.patch ++++++
# Commit 0d2686f6b66b4b1b3c72c3525083b0ce02830054
# Date 2023-03-21 09:23:25 +0100
# Author Jan Beulich <jbeul...@suse.com>
# Committer Jan Beulich <jbeul...@suse.com>
AMD/IOMMU: without XT, x2APIC needs to be forced into physical mode

An earlier change with the same title (commit 1ba66a870eba) altered only
the path where x2apic_phys was already set to false (perhaps from the
command line). The same of course needs applying when the variable
wasn't modified yet from its initial value.

Reported-by: Elliott Mitchell <ehem+...@m5p.com>
Signed-off-by: Jan Beulich <jbeul...@suse.com>
Reviewed-by: Andrew Cooper <andrew.coop...@citrix.com>
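
For context (per the Intel SDM, not part of this patch): in clustered mode
the x2APIC logical destination is a full 32-bit value, which only full
interrupt remapping can convey:

    /* x2APIC logical destination layout in clustered mode: */
    #define X2APIC_CLUSTER_ID(dest)   ((dest) >> 16)    /* bits 31:16 */
    #define X2APIC_LOGICAL_MASK(dest) ((dest) & 0xffff) /* bits 15:0,
                                                           one bit per CPU */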

--- a/xen/arch/x86/genapic/x2apic.c
+++ b/xen/arch/x86/genapic/x2apic.c
@@ -236,11 +236,11 @@ const struct genapic *__init apic_x2apic
     if ( x2apic_phys < 0 )
     {
         /*
-         * Force physical mode if there's no interrupt remapping support: The
-         * ID in clustered mode requires a 32 bit destination field due to
+         * Force physical mode if there's no (full) interrupt remapping support:
+         * The ID in clustered mode requires a 32 bit destination field due to
          * the usage of the high 16 bits to hold the cluster ID.
          */
-        x2apic_phys = !iommu_intremap ||
+        x2apic_phys = iommu_intremap != iommu_intremap_full ||
                       (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) ||
                       (IS_ENABLED(CONFIG_X2APIC_PHYSICAL) &&
                        !(acpi_gbl_FADT.flags & ACPI_FADT_APIC_CLUSTER));

++++++ 64199e0c-x86-shadow-account-for-log-dirty-mode.patch ++++++
# Commit 33fb3a6612235a23b30df556bf92b52a3450c340
# Date 2023-03-21 12:07:41 +0000
# Author Jan Beulich <jbeul...@suse.com>
# Committer Andrew Cooper <andrew.coop...@citrix.com>
x86/shadow: account for log-dirty mode when pre-allocating

Pre-allocation is intended to ensure that in the course of constructing
or updating shadows there won't be any risk of just-made shadows, or
shadows being acted upon, disappearing under our feet. The number of
pages pre-allocated then, however, needs to account for all possible
subsequent allocations. While the use in sh_page_fault() accounts for
all shadows which may need making, so far it didn't account for
allocations coming from log-dirty tracking (which piggybacks onto the
P2M allocation functions).

Since shadow_prealloc() takes a count of shadows (or other data
structures) rather than a count of pages, putting the adjustment at the
call site of this function won't work very well: We simply can't express
the correct count that way in all cases. Instead take care of this in
the function itself, by "snooping" for L1 type requests. (While not
applicable right now, future new request sites of L1 tables would then
also be covered right away.)

It is relevant to note here that pre-allocations like the one done from
shadow_alloc_p2m_page() are benign when they fall in the "scope" of an
earlier pre-alloc which already included that count: The inner call will
simply find enough pages available then; it'll bail right away.

This is CVE-2022-42332 / XSA-427.

Signed-off-by: Jan Beulich <jbeul...@suse.com>
Reviewed-by: Tim Deegan <t...@xen.org>

--- a/xen/arch/x86/include/asm/paging.h
+++ b/xen/arch/x86/include/asm/paging.h
@@ -190,6 +190,10 @@ bool paging_mfn_is_dirty(const struct do
 #define L4_LOGDIRTY_IDX(pfn) ((pfn_x(pfn) >> (PAGE_SHIFT + 3 + PAGETABLE_ORDER * 2)) & \
                               (LOGDIRTY_NODE_ENTRIES-1))
 
+#define paging_logdirty_levels() \
+    (DIV_ROUND_UP(PADDR_BITS - PAGE_SHIFT - (PAGE_SHIFT + 3), \
+                  PAGE_SHIFT - ilog2(sizeof(mfn_t))) + 1)
+
 #ifdef CONFIG_HVM
 /* VRAM dirty tracking support */
 struct sh_dirty_vram {
--- a/xen/arch/x86/mm/paging.c
+++ b/xen/arch/x86/mm/paging.c
@@ -282,6 +282,7 @@ void paging_mark_pfn_dirty(struct domain
     if ( unlikely(!VALID_M2P(pfn_x(pfn))) )
         return;
 
+    BUILD_BUG_ON(paging_logdirty_levels() != 4);
     i1 = L1_LOGDIRTY_IDX(pfn);
     i2 = L2_LOGDIRTY_IDX(pfn);
     i3 = L3_LOGDIRTY_IDX(pfn);
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -1015,7 +1015,17 @@ bool shadow_prealloc(struct domain *d, u
     if ( unlikely(d->is_dying) )
        return false;
 
-    ret = _shadow_prealloc(d, shadow_size(type) * count);
+    count *= shadow_size(type);
+    /*
+     * Log-dirty handling may result in allocations when populating its
+     * tracking structures.  Tie this to the caller requesting space for L1
+     * shadows.
+     */
+    if ( paging_mode_log_dirty(d) &&
+         ((SHF_L1_ANY | SHF_FL1_ANY) & (1u << type)) )
+        count += paging_logdirty_levels();
+
+    ret = _shadow_prealloc(d, count);
     if ( !ret && (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) )
         /*
         * Failing to allocate memory required for shadow usage can only result in
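
As a cross-check of paging_logdirty_levels() against the BUILD_BUG_ON()
above, a worked evaluation under the usual x86-64 assumptions (PADDR_BITS =
52, PAGE_SHIFT = 12, 8-byte mfn_t; these concrete values are illustrative,
not part of the patch): the leaf bitmap page covers PAGE_SHIFT + 3 = 15 bits
of PFN, each pointer level covers PAGE_SHIFT - 3 = 9 bits, so
ceil((52 - 12 - 15) / 9) = 3 pointer levels plus the leaf give 4 levels.

    #include <stdio.h>

    #define DIV_ROUND_UP(x, y) (((x) + (y) - 1) / (y))

    int main(void)
    {
        const unsigned int paddr_bits = 52, page_shift = 12;
        const unsigned int ilog2_mfn_t = 3; /* sizeof(mfn_t) == 8 */
        unsigned int levels;

        levels = DIV_ROUND_UP(paddr_bits - page_shift - (page_shift + 3),
                              page_shift - ilog2_mfn_t) + 1;

        printf("%u\n", levels); /* prints 4, matching the BUILD_BUG_ON() */
        return 0;
    }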

++++++ 64199e0d-x86-HVM-bound-number-of-pca-regions.patch ++++++
# Commit d484dcca7972794ad70cefd3af73fe0328d610f1
# Date 2023-03-21 12:07:41 +0000
# Author Jan Beulich <jbeul...@suse.com>
# Committer Andrew Cooper <andrew.coop...@citrix.com>
x86/HVM: bound number of pinned cache attribute regions

This is exposed via DMOP, i.e. to potentially not fully privileged
device models. Hence we must not permit registration of an (almost)
unbounded number of such regions.

This is CVE-2022-42333 / part of XSA-428.

Fixes: 642123c5123f ("x86/hvm: provide XEN_DMOP_pin_memory_cacheattr")
Signed-off-by: Jan Beulich <jbeul...@suse.com>
Reviewed-by: Andrew Cooper <andrew.coop...@citrix.com>

--- a/xen/arch/x86/hvm/mtrr.c
+++ b/xen/arch/x86/hvm/mtrr.c
@@ -595,6 +595,7 @@ int hvm_set_mem_pinned_cacheattr(struct
                                  uint64_t gfn_end, uint32_t type)
 {
     struct hvm_mem_pinned_cacheattr_range *range;
+    unsigned int nr = 0;
     int rc = 1;
 
     if ( !is_hvm_domain(d) )
@@ -666,11 +667,15 @@ int hvm_set_mem_pinned_cacheattr(struct
             rc = -EBUSY;
             break;
         }
+        ++nr;
     }
     rcu_read_unlock(&pinned_cacheattr_rcu_lock);
     if ( rc <= 0 )
         return rc;
 
+    if ( nr >= 64 /* The limit is arbitrary. */ )
+        return -ENOSPC;
+
     range = xzalloc(struct hvm_mem_pinned_cacheattr_range);
     if ( range == NULL )
         return -ENOMEM;
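
Schematically, the bound works by counting entries during the list walk that
is needed anyway and bailing before any allocation.  A stand-alone sketch
with hypothetical types (the real walk runs under rcu_read_lock() and also
rejects overlapping ranges):

    #include <errno.h>
    #include <stddef.h>

    struct range { struct range *next; unsigned long start, end; };

    #define RANGE_LIMIT 64 /* arbitrary cap, mirroring the patch */

    /* Return 0 if another range may be added, -ENOSPC once the cap is
     * reached; no memory is allocated until this check has passed. */
    static int check_room(const struct range *head)
    {
        unsigned int nr = 0;
        const struct range *r;

        for ( r = head; r; r = r->next )
            if ( ++nr >= RANGE_LIMIT )
                return -ENOSPC;

        return 0;
    }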

++++++ 64199e0e-x86-HVM-serialize-pca-list-manipulation.patch ++++++
# Commit ab2d47eb135380105b32a2a419f1e19445e19fc1
# Date 2023-03-21 12:07:42 +0000
# Author Jan Beulich <jbeul...@suse.com>
# Committer Andrew Cooper <andrew.coop...@citrix.com>
x86/HVM: serialize pinned cache attribute list manipulation

While the RCU variants of list insertion and removal allow lockless list
traversal (with RCU just read-locked), insertions and removals still
need serializing amongst themselves. To keep things simple, use the
domain lock for this purpose.

This is CVE-2022-42334 / part of XSA-428.

Fixes: 642123c5123f ("x86/hvm: provide XEN_DMOP_pin_memory_cacheattr")
Signed-off-by: Jan Beulich <jbeul...@suse.com>
Reviewed-by: Julien Grall <jgr...@amazon.com>

--- a/xen/arch/x86/hvm/mtrr.c
+++ b/xen/arch/x86/hvm/mtrr.c
@@ -594,7 +594,7 @@ static void cf_check free_pinned_cacheat
 int hvm_set_mem_pinned_cacheattr(struct domain *d, uint64_t gfn_start,
                                  uint64_t gfn_end, uint32_t type)
 {
-    struct hvm_mem_pinned_cacheattr_range *range;
+    struct hvm_mem_pinned_cacheattr_range *range, *newr;
     unsigned int nr = 0;
     int rc = 1;
 
@@ -608,14 +608,15 @@ int hvm_set_mem_pinned_cacheattr(struct
     {
     case XEN_DOMCTL_DELETE_MEM_CACHEATTR:
         /* Remove the requested range. */
-        rcu_read_lock(&pinned_cacheattr_rcu_lock);
-        list_for_each_entry_rcu ( range,
-                                  &d->arch.hvm.pinned_cacheattr_ranges,
-                                  list )
+        domain_lock(d);
+        list_for_each_entry ( range,
+                              &d->arch.hvm.pinned_cacheattr_ranges,
+                              list )
             if ( range->start == gfn_start && range->end == gfn_end )
             {
-                rcu_read_unlock(&pinned_cacheattr_rcu_lock);
                 list_del_rcu(&range->list);
+                domain_unlock(d);
+
                 type = range->type;
                 call_rcu(&range->rcu, free_pinned_cacheattr_entry);
                 p2m_memory_type_changed(d);
@@ -636,7 +637,7 @@ int hvm_set_mem_pinned_cacheattr(struct
                 }
                 return 0;
             }
-        rcu_read_unlock(&pinned_cacheattr_rcu_lock);
+        domain_unlock(d);
         return -ENOENT;
 
     case PAT_TYPE_UC_MINUS:
@@ -651,7 +652,10 @@ int hvm_set_mem_pinned_cacheattr(struct
         return -EINVAL;
     }
 
-    rcu_read_lock(&pinned_cacheattr_rcu_lock);
+    newr = xzalloc(struct hvm_mem_pinned_cacheattr_range);
+
+    domain_lock(d);
+
     list_for_each_entry_rcu ( range,
                               &d->arch.hvm.pinned_cacheattr_ranges,
                               list )
@@ -669,27 +673,34 @@ int hvm_set_mem_pinned_cacheattr(struct
         }
         ++nr;
     }
-    rcu_read_unlock(&pinned_cacheattr_rcu_lock);
+
     if ( rc <= 0 )
-        return rc;
+        /* nothing */;
+    else if ( nr >= 64 /* The limit is arbitrary. */ )
+        rc = -ENOSPC;
+    else if ( !newr )
+        rc = -ENOMEM;
+    else
+    {
+        newr->start = gfn_start;
+        newr->end = gfn_end;
+        newr->type = type;
 
-    if ( nr >= 64 /* The limit is arbitrary. */ )
-        return -ENOSPC;
+        list_add_rcu(&newr->list, &d->arch.hvm.pinned_cacheattr_ranges);
+
+        newr = NULL;
+        rc = 0;
+    }
 
-    range = xzalloc(struct hvm_mem_pinned_cacheattr_range);
-    if ( range == NULL )
-        return -ENOMEM;
+    domain_unlock(d);
 
-    range->start = gfn_start;
-    range->end = gfn_end;
-    range->type = type;
+    xfree(newr);
 
-    list_add_rcu(&range->list, &d->arch.hvm.pinned_cacheattr_ranges);
     p2m_memory_type_changed(d);
     if ( type != PAT_TYPE_WRBACK )
         flush_all(FLUSH_CACHE);
 
-    return 0;
+    return rc;
 }
 
 static int cf_check hvm_save_mtrr_msr(struct vcpu *v, hvm_domain_context_t *h)
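
The reshuffling above follows a pattern worth spelling out: allocate before
taking the lock (so the allocation can fail without holding it), publish the
new node under the lock, and free any unused allocation after dropping it.
A stand-alone sketch, with a pthread mutex and calloc() as stand-ins for the
domain lock and Xen's xzalloc():

    #include <errno.h>
    #include <pthread.h>
    #include <stdlib.h>

    struct node { struct node *next; int key; };

    static struct node *head;
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    static int insert_unique(int key)
    {
        struct node *newn = calloc(1, sizeof(*newn)); /* before the lock */
        struct node *n;
        int rc = 0;

        pthread_mutex_lock(&lock);

        for ( n = head; n; n = n->next )
            if ( n->key == key )
            {
                rc = -EEXIST; /* duplicate: the new node stays unused */
                break;
            }

        if ( !rc && !newn )
            rc = -ENOMEM;
        else if ( !rc )
        {
            newn->key = key;
            newn->next = head;
            head = newn; /* publish under the lock */
            newn = NULL; /* ownership moved to the list */
        }

        pthread_mutex_unlock(&lock);

        free(newn); /* no-op when the node was published */
        return rc;
    }

    int main(void)
    {
        insert_unique(1);
        return insert_unique(1) == -EEXIST ? 0 : 1;
    }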

++++++ 64199e0f-x86-spec-ctrl-defer-CR4_PV32_RESTORE-for-CSTAR.patch ++++++
# Commit 245d030f4aa79f766e575684127f86748c63bb32
# Date 2023-03-21 12:07:42 +0000
# Author Andrew Cooper <andrew.coop...@citrix.com>
# Committer Andrew Cooper <andrew.coop...@citrix.com>
x86/spec-ctrl: Defer CR4_PV32_RESTORE on the cstar_enter path

As stated (correctly) by the comment next to SPEC_CTRL_ENTRY_FROM_PV, between
the two hunks visible in the patch, RETs are not safe prior to this point.

CR4_PV32_RESTORE hides a CALL/RET pair in certain configurations (PV32
compiled in, SMEP or SMAP active), and the RET can be attacked with one of
several known speculative issues.

Furthermore, CR4_PV32_RESTORE also hides a reference to the cr4_pv32_mask
global variable, which is not safe when XPTI is active before restoring Xen's
full pagetables.

This crash has gone unnoticed because it is only AMD CPUs which permit the
SYSCALL instruction in compatibility mode, and these are not vulnerable to
Meltdown, so they don't activate XPTI by default.

This is XSA-429 / CVE-2022-42331.

Fixes: 5e7962901131 ("x86/entry: Organise the use of MSR_SPEC_CTRL at each entry/exit point")
Fixes: 5784de3e2067 ("x86: Meltdown band-aid against malicious 64-bit PV guests")
Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>

--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -288,7 +288,6 @@ ENTRY(cstar_enter)
         ALTERNATIVE "", "setssbsy", X86_FEATURE_XEN_SHSTK
 #endif
         push  %rax          /* Guest %rsp */
-        CR4_PV32_RESTORE
         movq  8(%rsp), %rax /* Restore guest %rax. */
        movq  $FLAT_USER_SS32, 8(%rsp) /* Assume a 64bit domain.  Compat handled lower. */
         pushq %r11
@@ -312,6 +311,8 @@ ENTRY(cstar_enter)
 .Lcstar_cr3_okay:
         sti
 
+        CR4_PV32_RESTORE
+
         movq  STACK_CPUINFO_FIELD(current_vcpu)(%rbx), %rbx
 
 #ifdef CONFIG_PV32

++++++ libxl.fix-guest-kexec-skip-cpuid-policy.patch ++++++
From: Jason Andryuk <jandr...@gmail.com>
Date: Thu, 26 Jan 2023 10:58:23 +0100
Subject: libxl: fix guest kexec - skip cpuid policy
Git-commit: 1e454c2b5b1172e0fc7457e411ebaba61db8fc87
References: bsc#1209245

When a domain performs a kexec (soft reset), libxl__build_pre() is
called with the existing domid.  Calling libxl__cpuid_legacy() on the
existing domain fails since the cpuid policy has already been set; as a
result the guest isn't rebuilt and the kexec doesn't happen.

xc: error: Failed to set d1's policy (err leaf 0xffffffff, subleaf 0xffffffff, msr 0xffffffff) (17 = File exists): Internal error
libxl: error: libxl_cpuid.c:494:libxl__cpuid_legacy: Domain 1:Failed to apply CPUID policy: File exists
libxl: error: libxl_create.c:1641:domcreate_rebuild_done: Domain 1:cannot (re-)build domain: -3
libxl: error: libxl_xshelp.c:201:libxl__xs_read_mandatory: xenstore read failed: `/libxl/1/type': No such file or directory
libxl: warning: libxl_dom.c:49:libxl__domain_type: unable to get domain type for domid=1, assuming HVM

During a soft_reset, skip calling libxl__cpuid_legacy() to avoid the
issue.  Before commit 34990446ca91, the libxl__cpuid_legacy() failure
would have been ignored, so kexec would continue.

Fixes: 34990446ca91 ("libxl: don't ignore the return value from xc_cpuid_apply_policy")
Signed-off-by: Jason Andryuk <jandr...@gmail.com>
Reviewed-by: Anthony PERARD <anthony.per...@citrix.com>
---
 tools/libs/light/libxl_create.c   | 2 ++
 tools/libs/light/libxl_dom.c      | 2 +-
 tools/libs/light/libxl_internal.h | 1 +
 3 files changed, 4 insertions(+), 1 deletion(-)

--- a/tools/libs/light/libxl_create.c
+++ b/tools/libs/light/libxl_create.c
@@ -2207,6 +2207,8 @@ static int do_domain_soft_reset(libxl_ctx *ctx,
                               aop_console_how);
     cdcs->domid_out = &domid_out;
 
+    state->soft_reset = true;
+
     dom_path = libxl__xs_get_dompath(gc, domid);
     if (!dom_path) {
         LOGD(ERROR, domid, "failed to read domain path");
--- a/tools/libs/light/libxl_dom.c
+++ b/tools/libs/light/libxl_dom.c
@@ -382,7 +382,7 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
     /* Construct a CPUID policy, but only for brand new domains.  Domains
      * being migrated-in/restored have CPUID handled during the
      * static_data_done() callback. */
-    if (!state->restore)
+    if (!state->restore && !state->soft_reset)
         rc = libxl__cpuid_legacy(ctx, domid, false, info);
 
 out:
--- a/tools/libs/light/libxl_internal.h
+++ b/tools/libs/light/libxl_internal.h
@@ -1411,6 +1411,7 @@ typedef struct {
     /* Whether this domain is being migrated/restored, or booting fresh.  Only
      * applicable to the primary domain, not support domains (e.g. stub QEMU). */
     bool restore;
+    bool soft_reset;
 } libxl__domain_build_state;
 
 _hidden void libxl__domain_build_state_init(libxl__domain_build_state *s);
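
Condensed restatement of the patched flow (hypothetical, stripped-down
types; the real libxl__domain_build_state carries many more fields): the
soft-reset path raises the flag before the rebuild, and libxl__build_pre()
then skips the CPUID policy call that would fail with "File exists".

    #include <stdbool.h>
    #include <stdio.h>

    struct build_state { bool restore; bool soft_reset; };

    static int cpuid_legacy(unsigned int domid)
    {
        /* On a live domain whose policy is already set this would fail
         * with EEXIST -- the kexec case the patch avoids. */
        printf("setting CPUID policy for domain %u\n", domid);
        return 0;
    }

    static int build_pre(unsigned int domid, const struct build_state *s)
    {
        int rc = 0;

        /* Only brand new domains need a policy constructed here. */
        if (!s->restore && !s->soft_reset)
            rc = cpuid_legacy(domid);

        return rc;
    }

    int main(void)
    {
        struct build_state fresh = { 0 };
        struct build_state kexec = { .soft_reset = true };

        build_pre(1, &fresh); /* constructs the policy */
        build_pre(1, &kexec); /* skipped: policy already exists */
        return 0;
    }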
