From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/spec-ctrl: Add spec-ctrl=unpriv-mmio

Per Xen's support statement, PCI passthrough should be to trusted domains
because the overall system security depends on factors outside of Xen's
control.

As such, Xen, in a supported configuration, is not vulnerable to DRPW/SBDR.

However, users who have risk assessed their configuration may be happy with
the risk of DoS, but unhappy with the risk of cross-domain data leakage.  Such
users should enable this option.

When enabled, and when running with on impacted systems with the Intel May
2022 microcode, this option mitigates cross-domain fill buffer leakage by
flushing on entry to any domain with MMIO access, and engages the existing
srb-lock to protect RNG data.

This is part of XSA-404.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
index 476942a48ba0..adfd4ad39941 100644
--- a/docs/misc/xen-command-line.pandoc
+++ b/docs/misc/xen-command-line.pandoc
@@ -2259,7 +2259,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`).
 ### spec-ctrl (x86)
 > `= List of [ <bool>, xen=<bool>, {pv,hvm,msr-sc,rsb,md-clear}=<bool>,
 >              bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,eager-fpu,
->              l1d-flush,branch-harden,srb-lock}=<bool> ]`
+>              l1d-flush,branch-harden,srb-lock,unpriv-mmio}=<bool> ]`
 
 Controls for speculative execution sidechannel mitigations.  By default, Xen
 will pick the most appropriate mitigations based on compiled in support,
@@ -2339,8 +2339,18 @@ Xen will enable this mitigation.
 On hardware supporting SRBDS_CTRL, the `srb-lock=` option can be used to force
 or prevent Xen from protect the Special Register Buffer from leaking stale
 data. By default, Xen will enable this mitigation, except on parts where MDS
-is fixed and TAA is fixed/mitigated (in which case, there is believed to be no
-way for an attacker to obtain the stale data).
+is fixed and TAA is fixed/mitigated and there are no unprivileged MMIO
+mappings (in which case, there is believed to be no way for an attacker to
+obtain stale data).
+
+The `unpriv-mmio=` boolean indicates whether the system has (or will have)
+less than fully privileged domains with access to MMIO devices.  Per Xen's
+support statement PCI Passthrough should be to trusted guests only, but users
+who have risk-assessed further might be happy to tolerate the risk of DoS, but
+not of data leakage.  Such user should enable this option.  When enabled, this
+option uses `FB_CLEAR` and/or `SRBDS_CTRL` functionality available in the
+Intel May 2022 microcode release to mitigate cross-domain leakage of fill
+buffers via the MMIO Stale Data vulnerabilities.
 
 ### sync_console
 > `= <boolean>`
diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
index ded33f5d0f2e..c19e82e07143 100644
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -67,6 +67,8 @@ static bool __initdata cpu_has_bug_msbds_only; /* => minimal HT impact. */
 static bool __initdata cpu_has_bug_mds; /* Any other M{LP,SB,FB}DS combination. */
 
 static int8_t __initdata opt_srb_lock = -1;
+static bool __ro_after_init opt_unpriv_mmio;
+static bool __ro_after_init opt_fb_clear_mmio;
 
 static int __init cf_check parse_spec_ctrl(const char *s)
 {
@@ -184,6 +186,8 @@ static int __init cf_check parse_spec_ctrl(const char *s)
             opt_branch_harden = val;
         else if ( (val = parse_boolean("srb-lock", s, ss)) >= 0 )
             opt_srb_lock = val;
+        else if ( (val = parse_boolean("unpriv-mmio", s, ss)) >= 0 )
+            opt_unpriv_mmio = val;
         else
             rc = -EINVAL;
 
@@ -392,7 +396,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
            opt_srb_lock                              ? " SRB_LOCK+" : " SRB_LOCK-",
            opt_ibpb                                  ? " IBPB"  : "",
            opt_l1d_flush                             ? " L1D_FLUSH" : "",
-           opt_md_clear_pv || opt_md_clear_hvm       ? " VERW"  : "",
+           opt_md_clear_pv || opt_md_clear_hvm ||
+           opt_fb_clear_mmio                         ? " VERW"  : "",
            opt_branch_harden                         ? " BRANCH_HARDEN" : "");
 
     /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */
@@ -942,7 +947,9 @@ void spec_ctrl_init_domain(struct domain *d)
 {
     bool pv = is_pv_domain(d);
 
-    d->arch.verw = pv ? opt_md_clear_pv : opt_md_clear_hvm;
+    d->arch.verw =
+        (pv ? opt_md_clear_pv : opt_md_clear_hvm) ||
+        (opt_fb_clear_mmio && is_iommu_enabled(d));
 }
 
 void __init init_speculation_mitigations(void)
@@ -1197,6 +1204,18 @@ void __init init_speculation_mitigations(void)
     mds_calculations(caps);
 
     /*
+     * Parts which enumerate FB_CLEAR are those which have reintroduced the
+     * VERW flushing side because of a suceptability to FBSDP.
+     *
+     * If unprivileged guests have (or will have) MMIO mappings, we can
+     * mitigate cross-domain leakage of fill buffer data by issuing VERW on
+     * the return-to-guest path.  All privileges of software responsible for
+     * not leaking their own secrets when using MMIO.
+     */
+    if ( opt_unpriv_mmio )
+        opt_fb_clear_mmio = caps & ARCH_CAPS_FB_CLEAR;
+
+    /*
      * By default, enable PV and HVM mitigations on MDS-vulnerable hardware.
      * This will only be a token effort for MLPDS/MFBDS when HT is enabled,
      * but it is somewhat better than nothing.
@@ -1209,8 +1228,8 @@ void __init init_speculation_mitigations(void)
                             boot_cpu_has(X86_FEATURE_MD_CLEAR));
 
     /*
-     * Enable MDS defences as applicable.  The Idle blocks need using if
-     * either PV or HVM defences are used.
+     * Enable MDS/MMIO defences as applicable.  The Idle blocks need using if
+     * either PV or HVM, or if we give MMIO access to untrusted guests.
      *
      * HVM is more complicated.  The MD_CLEAR microcode extends L1D_FLUSH with
      * equivelent semantics to avoid needing to perform both flushes on the
@@ -1220,7 +1239,7 @@ void __init init_speculation_mitigations(void)
      * opt_md_clear_hvm to mean just "should we VERW on the way into HVM
      * guests", so spec_ctrl_init_domain() can calculate suitable settings.
      */
-    if ( opt_md_clear_pv || opt_md_clear_hvm )
+    if ( opt_md_clear_pv || opt_md_clear_hvm || opt_fb_clear_mmio )
         setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE);
     opt_md_clear_hvm &= !(caps & ARCH_CAPS_SKIP_L1DFL) && !opt_l1d_flush;
 
@@ -1285,14 +1304,19 @@ void __init init_speculation_mitigations(void)
      * On some SRBDS-affected hardware, it may be safe to relax srb-lock by
      * default.
      *
-     * On parts which enumerate MDS_NO and not TAA_NO, TSX is the only known
-     * way to access the Fill Buffer.  If TSX isn't available (inc. SKU
-     * reasons on some models), or TSX is explicitly disabled, then there is
-     * no need for the extra overhead to protect RDRAND/RDSEED.
+     * All parts with SRBDS_CTRL suffer SSDP, the mechanism by which stale RNG
+     * data becomes available to other contexts.  To recover the data, an
+     * attaker needs to use:
+     *  - SBDS (MDS or TAA to sample the cores fill buffer)
+     *  - SBDR (Architectrually retrieve stale transaction buffer contents)
+     *  - DRPW (Architectrually latch stale fill buffer data)
+     *
+     * Therefore, on MDS_NO parts, and TAA_NO or TSX unavailable/disabled, and
+     * no unprivileged MMIO access, the RNG data doesn't need protecting.
      */
     if ( cpu_has_srbds_ctrl )
     {
-        if ( opt_srb_lock == -1 &&
+        if ( opt_srb_lock == -1 && !opt_unpriv_mmio &&
              (caps & (ARCH_CAPS_MDS_NO|ARCH_CAPS_TAA_NO)) == ARCH_CAPS_MDS_NO &&
              (!cpu_has_hle || ((caps & ARCH_CAPS_TSX_CTRL) && rtm_disabled)) )
             opt_srb_lock = 0;
