commit:     e27f74f7a987777e413fae28ed697b00889a687a
Author:     Mike Pagano <mpagano <AT> gentoo <DOT> org>
AuthorDate: Fri Oct  7 11:11:59 2022 +0000
Commit:     Mike Pagano <mpagano <AT> gentoo <DOT> org>
CommitDate: Fri Oct  7 11:11:59 2022 +0000
URL:        https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=e27f74f7

Linux patch 5.4.217

Signed-off-by: Mike Pagano <mpagano <AT> gentoo.org>

 0000_README              |    4 +
 1216_linux-5.4.217.patch | 3111 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 3115 insertions(+)

diff --git a/0000_README b/0000_README
index 91501850..c7ac01c3 100644
--- a/0000_README
+++ b/0000_README
@@ -907,6 +907,10 @@ Patch:  1215_linux-5.4.216.patch
 From:   http://www.kernel.org
 Desc:   Linux 5.4.216
 
+Patch:  1216_linux-5.4.217.patch
+From:   http://www.kernel.org
+Desc:   Linux 5.4.217
+
 Patch:  1500_XATTR_USER_PREFIX.patch
 From:   https://bugs.gentoo.org/show_bug.cgi?id=470644
 Desc:   Support for namespace user.pax.* on tmpfs.

diff --git a/1216_linux-5.4.217.patch b/1216_linux-5.4.217.patch
new file mode 100644
index 00000000..342e7a14
--- /dev/null
+++ b/1216_linux-5.4.217.patch
@@ -0,0 +1,3111 @@
+diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
+index db9d53b879f89..8f71a17ad5442 100644
+--- a/Documentation/admin-guide/kernel-parameters.txt
++++ b/Documentation/admin-guide/kernel-parameters.txt
+@@ -4298,6 +4298,18 @@
+ 
+       retain_initrd   [RAM] Keep initrd memory after extraction
+ 
++      retbleed=       [X86] Control mitigation of RETBleed (Arbitrary
++                      Speculative Code Execution with Return Instructions)
++                      vulnerability.
++
++                      off         - unconditionally disable
++                      auto        - automatically select a migitation
++
++                      Selecting 'auto' will choose a mitigation method at run
++                      time according to the CPU.
++
++                      Not specifying this option is equivalent to retbleed=auto.
++
+       rfkill.default_state=
+               0       "airplane mode".  All wifi, bluetooth, wimax, gps, fm,
+                       etc. communication is blocked by default.
+@@ -4541,6 +4553,7 @@
+                       eibrs             - enhanced IBRS
+                       eibrs,retpoline   - enhanced IBRS + Retpolines
+                       eibrs,lfence      - enhanced IBRS + LFENCE
++                      ibrs              - use IBRS to protect kernel
+ 
+                       Not specifying this option is equivalent to
+                       spectre_v2=auto.
+diff --git a/Documentation/process/code-of-conduct-interpretation.rst b/Documentation/process/code-of-conduct-interpretation.rst
+index e899f14a4ba24..4f8a06b00f608 100644
+--- a/Documentation/process/code-of-conduct-interpretation.rst
++++ b/Documentation/process/code-of-conduct-interpretation.rst
+@@ -51,7 +51,7 @@ the Technical Advisory Board (TAB) or other maintainers if you're
+ uncertain how to handle situations that come up.  It will not be
+ considered a violation report unless you want it to be.  If you are
+ uncertain about approaching the TAB or any other maintainers, please
+-reach out to our conflict mediator, Mishi Choudhary <mi...@linux.com>.
++reach out to our conflict mediator, Joanna Lee <joanna....@gesmer.com>.
+ 
+ In the end, "be kind to each other" is really what the end goal is for
+ everybody.  We know everyone is human and we all fail at times, but the
+diff --git a/Makefile b/Makefile
+index 3d9d7ef6f8bf1..201ac8e410a94 100644
+--- a/Makefile
++++ b/Makefile
+@@ -1,7 +1,7 @@
+ # SPDX-License-Identifier: GPL-2.0
+ VERSION = 5
+ PATCHLEVEL = 4
+-SUBLEVEL = 216
++SUBLEVEL = 217
+ EXTRAVERSION =
+ NAME = Kleptomaniac Octopus
+ 
+diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
+index b3f1214787386..29e5675c6d4f2 100644
+--- a/arch/x86/entry/calling.h
++++ b/arch/x86/entry/calling.h
+@@ -6,6 +6,8 @@
+ #include <asm/percpu.h>
+ #include <asm/asm-offsets.h>
+ #include <asm/processor-flags.h>
++#include <asm/msr.h>
++#include <asm/nospec-branch.h>
+ 
+ /*
+ 
+@@ -146,27 +148,19 @@ For 32-bit we have the following conventions - kernel is built with
+ 
+ .endm
+ 
+-.macro POP_REGS pop_rdi=1 skip_r11rcx=0
++.macro POP_REGS pop_rdi=1
+       popq %r15
+       popq %r14
+       popq %r13
+       popq %r12
+       popq %rbp
+       popq %rbx
+-      .if \skip_r11rcx
+-      popq %rsi
+-      .else
+       popq %r11
+-      .endif
+       popq %r10
+       popq %r9
+       popq %r8
+       popq %rax
+-      .if \skip_r11rcx
+-      popq %rsi
+-      .else
+       popq %rcx
+-      .endif
+       popq %rdx
+       popq %rsi
+       .if \pop_rdi
+@@ -316,6 +310,62 @@ For 32-bit we have the following conventions - kernel is built with
+ 
+ #endif
+ 
++/*
++ * IBRS kernel mitigation for Spectre_v2.
++ *
++ * Assumes full context is established (PUSH_REGS, CR3 and GS) and it clobbers
++ * the regs it uses (AX, CX, DX). Must be called before the first RET
++ * instruction (NOTE! UNTRAIN_RET includes a RET instruction)
++ *
++ * The optional argument is used to save/restore the current value,
++ * which is used on the paranoid paths.
++ *
++ * Assumes x86_spec_ctrl_{base,current} to have SPEC_CTRL_IBRS set.
++ */
++.macro IBRS_ENTER save_reg
++      ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS
++      movl    $MSR_IA32_SPEC_CTRL, %ecx
++
++.ifnb \save_reg
++      rdmsr
++      shl     $32, %rdx
++      or      %rdx, %rax
++      mov     %rax, \save_reg
++      test    $SPEC_CTRL_IBRS, %eax
++      jz      .Ldo_wrmsr_\@
++      lfence
++      jmp     .Lend_\@
++.Ldo_wrmsr_\@:
++.endif
++
++      movq    PER_CPU_VAR(x86_spec_ctrl_current), %rdx
++      movl    %edx, %eax
++      shr     $32, %rdx
++      wrmsr
++.Lend_\@:
++.endm
++
++/*
++ * Similar to IBRS_ENTER, requires KERNEL GS,CR3 and clobbers (AX, CX, DX)
++ * regs. Must be called after the last RET.
++ */
++.macro IBRS_EXIT save_reg
++      ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS
++      movl    $MSR_IA32_SPEC_CTRL, %ecx
++
++.ifnb \save_reg
++      mov     \save_reg, %rdx
++.else
++      movq    PER_CPU_VAR(x86_spec_ctrl_current), %rdx
++      andl    $(~SPEC_CTRL_IBRS), %edx
++.endif
++
++      movl    %edx, %eax
++      shr     $32, %rdx
++      wrmsr
++.Lend_\@:
++.endm
++
+ /*
+  * Mitigate Spectre v1 for conditional swapgs code paths.
+  *
+diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
+index bde3e0f85425f..2d837fb54c31b 100644
+--- a/arch/x86/entry/entry_32.S
++++ b/arch/x86/entry/entry_32.S
+@@ -750,7 +750,6 @@ ENTRY(__switch_to_asm)
+       movl    %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset
+ #endif
+ 
+-#ifdef CONFIG_RETPOLINE
+       /*
+        * When switching from a shallower to a deeper call stack
+        * the RSB may either underflow or use entries populated
+@@ -759,7 +758,6 @@ ENTRY(__switch_to_asm)
+        * speculative execution to prevent attack.
+        */
+       FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
+-#endif
+ 
+       /* restore callee-saved registers */
+       popfl
+diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
+index 2ba3d53ac5b11..c82136030d58f 100644
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -172,6 +172,10 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
+       /* IRQs are off. */
+       movq    %rax, %rdi
+       movq    %rsp, %rsi
++
++      /* clobbers %rax, make sure it is after saving the syscall nr */
++      IBRS_ENTER
++
+       call    do_syscall_64           /* returns with IRQs disabled */
+ 
+       TRACE_IRQS_IRETQ                /* we're about to change IF */
+@@ -248,8 +252,8 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
+        * perf profiles. Nothing jumps here.
+        */
+ syscall_return_via_sysret:
+-      /* rcx and r11 are already restored (see code above) */
+-      POP_REGS pop_rdi=0 skip_r11rcx=1
++      IBRS_EXIT
++      POP_REGS pop_rdi=0
+ 
+       /*
+        * Now all regs are restored except RSP and RDI.
+@@ -301,7 +305,6 @@ ENTRY(__switch_to_asm)
+       movq    %rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset
+ #endif
+ 
+-#ifdef CONFIG_RETPOLINE
+       /*
+        * When switching from a shallower to a deeper call stack
+        * the RSB may either underflow or use entries populated
+@@ -310,7 +313,6 @@ ENTRY(__switch_to_asm)
+        * speculative execution to prevent attack.
+        */
+       FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
+-#endif
+ 
+       /* restore callee-saved registers */
+       popq    %r15
+@@ -622,6 +624,7 @@ GLOBAL(retint_user)
+       TRACE_IRQS_IRETQ
+ 
+ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
++      IBRS_EXIT
+ #ifdef CONFIG_DEBUG_ENTRY
+       /* Assert that pt_regs indicates user mode. */
+       testb   $3, CS(%rsp)
+@@ -1248,7 +1251,13 @@ ENTRY(paranoid_entry)
+        */
+       FENCE_SWAPGS_KERNEL_ENTRY
+ 
+-      ret
++      /*
++       * Once we have CR3 and %GS setup save and set SPEC_CTRL. Just like
++       * CR3 above, keep the old value in a callee saved register.
++       */
++      IBRS_ENTER save_reg=%r15
++
++      RET
+ END(paranoid_entry)
+ 
+ /*
+@@ -1276,12 +1285,20 @@ ENTRY(paranoid_exit)
+       jmp     .Lparanoid_exit_restore
+ .Lparanoid_exit_no_swapgs:
+       TRACE_IRQS_IRETQ_DEBUG
++
++      /*
++       * Must restore IBRS state before both CR3 and %GS since we need access
++       * to the per-CPU x86_spec_ctrl_shadow variable.
++       */
++      IBRS_EXIT save_reg=%r15
++
+       /* Always restore stashed CR3 value (see paranoid_entry) */
+       RESTORE_CR3     scratch_reg=%rbx save_reg=%r14
+ .Lparanoid_exit_restore:
+       jmp restore_regs_and_return_to_kernel
+ END(paranoid_exit)
+ 
++
+ /*
+  * Save all registers in pt_regs, and switch GS if needed.
+  */
+@@ -1301,6 +1318,7 @@ ENTRY(error_entry)
+       FENCE_SWAPGS_USER_ENTRY
+       /* We have user CR3.  Change to kernel CR3. */
+       SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
++      IBRS_ENTER
+ 
+ .Lerror_entry_from_usermode_after_swapgs:
+       /* Put us onto the real thread stack. */
+@@ -1356,6 +1374,7 @@ ENTRY(error_entry)
+       SWAPGS
+       FENCE_SWAPGS_USER_ENTRY
+       SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
++      IBRS_ENTER
+ 
+       /*
+        * Pretend that the exception came from user mode: set up pt_regs
+@@ -1461,6 +1480,8 @@ ENTRY(nmi)
+       PUSH_AND_CLEAR_REGS rdx=(%rdx)
+       ENCODE_FRAME_POINTER
+ 
++      IBRS_ENTER
++
+       /*
+        * At this point we no longer need to worry about stack damage
+        * due to nesting -- we're on the normal thread stack and we're
+@@ -1684,6 +1705,9 @@ end_repeat_nmi:
+       movq    $-1, %rsi
+       call    do_nmi
+ 
++      /* Always restore stashed SPEC_CTRL value (see paranoid_entry) */
++      IBRS_EXIT save_reg=%r15
++
+       /* Always restore stashed CR3 value (see paranoid_entry) */
+       RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
+ 
+diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
+index 39913770a44d5..c3c4ea4a6711a 100644
+--- a/arch/x86/entry/entry_64_compat.S
++++ b/arch/x86/entry/entry_64_compat.S
+@@ -4,7 +4,6 @@
+  *
+  * Copyright 2000-2002 Andi Kleen, SuSE Labs.
+  */
+-#include "calling.h"
+ #include <asm/asm-offsets.h>
+ #include <asm/current.h>
+ #include <asm/errno.h>
+@@ -17,6 +16,8 @@
+ #include <linux/linkage.h>
+ #include <linux/err.h>
+ 
++#include "calling.h"
++
+       .section .entry.text, "ax"
+ 
+ /*
+@@ -106,6 +107,8 @@ ENTRY(entry_SYSENTER_compat)
+       xorl    %r15d, %r15d            /* nospec   r15 */
+       cld
+ 
++      IBRS_ENTER
++
+       /*
+        * SYSENTER doesn't filter flags, so we need to clear NT and AC
+        * ourselves.  To save a few cycles, we can check whether
+@@ -253,6 +256,8 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
+        */
+       TRACE_IRQS_OFF
+ 
++      IBRS_ENTER
++
+       movq    %rsp, %rdi
+       call    do_fast_syscall_32
+       /* XEN PV guests always use IRET path */
+@@ -267,6 +272,9 @@ sysret32_from_system_call:
+        */
+       STACKLEAK_ERASE
+       TRACE_IRQS_ON                   /* User mode traces as IRQs on. */
++
++      IBRS_EXIT
++
+       movq    RBX(%rsp), %rbx         /* pt_regs->rbx */
+       movq    RBP(%rsp), %rbp         /* pt_regs->rbp */
+       movq    EFLAGS(%rsp), %r11      /* pt_regs->flags (in r11) */
+@@ -408,6 +416,7 @@ ENTRY(entry_INT80_compat)
+        * gate turned them off.
+        */
+       TRACE_IRQS_OFF
++      IBRS_ENTER
+ 
+       movq    %rsp, %rdi
+       call    do_int80_syscall_32
+diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h
+index 0c814cd9ea42c..cdf39decf7340 100644
+--- a/arch/x86/include/asm/cpu_device_id.h
++++ b/arch/x86/include/asm/cpu_device_id.h
+@@ -5,15 +5,22 @@
+ /*
+  * Declare drivers belonging to specific x86 CPUs
+  * Similar in spirit to pci_device_id and related PCI functions
++ *
++ * The wildcard initializers are in mod_devicetable.h because
++ * file2alias needs them. Sigh.
+  */
+-
+ #include <linux/mod_devicetable.h>
++/* Get the INTEL_FAM* model defines */
++#include <asm/intel-family.h>
++/* And the X86_VENDOR_* ones */
++#include <asm/processor.h>
+ 
++/* Centaur FAM6 models */
++#define X86_CENTAUR_FAM6_C7_A         0xa
+ #define X86_CENTAUR_FAM6_C7_D         0xd
+ #define X86_CENTAUR_FAM6_NANO         0xf
+ 
+ #define X86_STEPPINGS(mins, maxs)    GENMASK(maxs, mins)
+-
+ /**
+  * X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE - Base macro for CPU matching
+  * @_vendor:  The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
+@@ -26,8 +33,11 @@
+  *            format is unsigned long. The supplied value, pointer
+  *            etc. is casted to unsigned long internally.
+  *
+- * Backport version to keep the SRBDS pile consistant. No shorter variants
+- * required for this.
++ * Use only if you need all selectors. Otherwise use one of the shorter
++ * macros of the X86_MATCH_* family. If there is no matching shorthand
++ * macro, consider to add one. If you really need to wrap one of the macros
++ * into another macro at the usage site for good reasons, then please
++ * start this local macro with X86_MATCH to allow easy grepping.
+  */
+ #define X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(_vendor, _family, _model, \
+                                                   _steppings, _feature, _data) { \
+@@ -39,6 +49,120 @@
+       .driver_data    = (unsigned long) _data                         \
+ }
+ 
++/**
++ * X86_MATCH_VENDOR_FAM_MODEL_FEATURE - Macro for CPU matching
++ * @_vendor:  The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
++ *            The name is expanded to X86_VENDOR_@_vendor
++ * @_family:  The family number or X86_FAMILY_ANY
++ * @_model:   The model number, model constant or X86_MODEL_ANY
++ * @_feature: A X86_FEATURE bit or X86_FEATURE_ANY
++ * @_data:    Driver specific data or NULL. The internal storage
++ *            format is unsigned long. The supplied value, pointer
++ *            etc. is casted to unsigned long internally.
++ *
++ * The steppings arguments of X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE() is
++ * set to wildcards.
++ */
++#define X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, model, feature, data) \
++      X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(vendor, family, model, \
++                                              X86_STEPPING_ANY, feature, data)
++
++/**
++ * X86_MATCH_VENDOR_FAM_FEATURE - Macro for matching vendor, family and CPU feature
++ * @vendor:   The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
++ *            The name is expanded to X86_VENDOR_@vendor
++ * @family:   The family number or X86_FAMILY_ANY
++ * @feature:  A X86_FEATURE bit
++ * @data:     Driver specific data or NULL. The internal storage
++ *            format is unsigned long. The supplied value, pointer
++ *            etc. is casted to unsigned long internally.
++ *
++ * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are
++ * set to wildcards.
++ */
++#define X86_MATCH_VENDOR_FAM_FEATURE(vendor, family, feature, data)   \
++      X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family,              \
++                                         X86_MODEL_ANY, feature, data)
++
++/**
++ * X86_MATCH_VENDOR_FEATURE - Macro for matching vendor and CPU feature
++ * @vendor:   The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
++ *            The name is expanded to X86_VENDOR_@vendor
++ * @feature:  A X86_FEATURE bit
++ * @data:     Driver specific data or NULL. The internal storage
++ *            format is unsigned long. The supplied value, pointer
++ *            etc. is casted to unsigned long internally.
++ *
++ * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are
++ * set to wildcards.
++ */
++#define X86_MATCH_VENDOR_FEATURE(vendor, feature, data)                       \
++      X86_MATCH_VENDOR_FAM_FEATURE(vendor, X86_FAMILY_ANY, feature, data)
++
++/**
++ * X86_MATCH_FEATURE - Macro for matching a CPU feature
++ * @feature:  A X86_FEATURE bit
++ * @data:     Driver specific data or NULL. The internal storage
++ *            format is unsigned long. The supplied value, pointer
++ *            etc. is casted to unsigned long internally.
++ *
++ * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are
++ * set to wildcards.
++ */
++#define X86_MATCH_FEATURE(feature, data)                              \
++      X86_MATCH_VENDOR_FEATURE(ANY, feature, data)
++
++/* Transitional to keep the existing code working */
++#define X86_FEATURE_MATCH(feature)    X86_MATCH_FEATURE(feature, NULL)
++
++/**
++ * X86_MATCH_VENDOR_FAM_MODEL - Match vendor, family and model
++ * @vendor:   The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
++ *            The name is expanded to X86_VENDOR_@vendor
++ * @family:   The family number or X86_FAMILY_ANY
++ * @model:    The model number, model constant or X86_MODEL_ANY
++ * @data:     Driver specific data or NULL. The internal storage
++ *            format is unsigned long. The supplied value, pointer
++ *            etc. is casted to unsigned long internally.
++ *
++ * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are
++ * set to wildcards.
++ */
++#define X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, data)               \
++      X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, model,       \
++                                         X86_FEATURE_ANY, data)
++
++/**
++ * X86_MATCH_VENDOR_FAM - Match vendor and family
++ * @vendor:   The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
++ *            The name is expanded to X86_VENDOR_@vendor
++ * @family:   The family number or X86_FAMILY_ANY
++ * @data:     Driver specific data or NULL. The internal storage
++ *            format is unsigned long. The supplied value, pointer
++ *            etc. is casted to unsigned long internally.
++ *
++ * All other missing arguments to X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are
++ * set of wildcards.
++ */
++#define X86_MATCH_VENDOR_FAM(vendor, family, data)                    \
++      X86_MATCH_VENDOR_FAM_MODEL(vendor, family, X86_MODEL_ANY, data)
++
++/**
++ * X86_MATCH_INTEL_FAM6_MODEL - Match vendor INTEL, family 6 and model
++ * @model:    The model name without the INTEL_FAM6_ prefix or ANY
++ *            The model name is expanded to INTEL_FAM6_@model internally
++ * @data:     Driver specific data or NULL. The internal storage
++ *            format is unsigned long. The supplied value, pointer
++ *            etc. is casted to unsigned long internally.
++ *
++ * The vendor is set to INTEL, the family to 6 and all other missing
++ * arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are set to wildcards.
++ *
++ * See X86_MATCH_VENDOR_FAM_MODEL_FEATURE() for further information.
++ */
++#define X86_MATCH_INTEL_FAM6_MODEL(model, data)                               \
++      X86_MATCH_VENDOR_FAM_MODEL(INTEL, 6, INTEL_FAM6_##model, data)
++
+ /*
+  * Match specific microcode revisions.
+  *
+diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
+index 736b0e412344b..2ec85d7bfdff2 100644
+--- a/arch/x86/include/asm/cpufeatures.h
++++ b/arch/x86/include/asm/cpufeatures.h
+@@ -203,8 +203,8 @@
+ #define X86_FEATURE_PROC_FEEDBACK     ( 7*32+ 9) /* AMD ProcFeedbackInterface */
+ #define X86_FEATURE_SME                       ( 7*32+10) /* AMD Secure Memory Encryption */
+ #define X86_FEATURE_PTI                       ( 7*32+11) /* Kernel Page Table Isolation enabled */
+-#define X86_FEATURE_RETPOLINE         ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */
+-#define X86_FEATURE_RETPOLINE_LFENCE  ( 7*32+13) /* "" Use LFENCE for Spectre variant 2 */
++#define X86_FEATURE_KERNEL_IBRS               ( 7*32+12) /* "" Set/clear IBRS on kernel entry/exit */
++#define X86_FEATURE_RSB_VMEXIT                ( 7*32+13) /* "" Fill RSB on VM-Exit */
+ #define X86_FEATURE_INTEL_PPIN                ( 7*32+14) /* Intel Processor Inventory Number */
+ #define X86_FEATURE_CDP_L2            ( 7*32+15) /* Code and Data Prioritization L2 */
+ #define X86_FEATURE_MSR_SPEC_CTRL     ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */
+@@ -286,7 +286,10 @@
+ #define X86_FEATURE_CQM_MBM_LOCAL     (11*32+ 3) /* LLC Local MBM monitoring */
+ #define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */
+ #define X86_FEATURE_FENCE_SWAPGS_KERNEL       (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */
+-#define X86_FEATURE_RSB_VMEXIT_LITE   (11*32+ 6) /* "" Fill RSB on VM exit when EIBRS is enabled */
++#define X86_FEATURE_RRSBA_CTRL                (11*32+11) /* "" RET prediction control */
++#define X86_FEATURE_RETPOLINE         (11*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */
++#define X86_FEATURE_RETPOLINE_LFENCE  (11*32+13) /* "" Use LFENCE for Spectre variant 2 */
++#define X86_FEATURE_RSB_VMEXIT_LITE   (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */
+ 
+ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
+ #define X86_FEATURE_AVX512_BF16               (12*32+ 5) /* AVX512 BFLOAT16 instructions */
+@@ -303,6 +306,7 @@
+ #define X86_FEATURE_AMD_SSBD          (13*32+24) /* "" Speculative Store Bypass Disable */
+ #define X86_FEATURE_VIRT_SSBD         (13*32+25) /* Virtualized Speculative Store Bypass Disable */
+ #define X86_FEATURE_AMD_SSB_NO                (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */
++#define X86_FEATURE_BTC_NO            (13*32+29) /* "" Not vulnerable to Branch Type Confusion */
+ 
+ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
+ #define X86_FEATURE_DTHERM            (14*32+ 0) /* Digital Thermal Sensor */
+@@ -407,7 +411,8 @@
+ #define X86_BUG_ITLB_MULTIHIT         X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */
+ #define X86_BUG_SRBDS                 X86_BUG(24) /* CPU may leak RNG bits if not mitigated */
+ #define X86_BUG_MMIO_STALE_DATA               X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */
+-#define X86_BUG_MMIO_UNKNOWN          X86_BUG(26) /* CPU is too old and its MMIO Stale Data status is unknown */
++#define X86_BUG_RETBLEED              X86_BUG(26) /* CPU is affected by RETBleed */
+ #define X86_BUG_EIBRS_PBRSB           X86_BUG(27) /* EIBRS is vulnerable to Post Barrier RSB Predictions */
++#define X86_BUG_MMIO_UNKNOWN          X86_BUG(28) /* CPU is too old and its MMIO Stale Data status is unknown */
+ 
+ #endif /* _ASM_X86_CPUFEATURES_H */
+diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
+index 5b07573c3bc87..c1d6d8bbb7dad 100644
+--- a/arch/x86/include/asm/intel-family.h
++++ b/arch/x86/include/asm/intel-family.h
+@@ -35,6 +35,9 @@
+  * The #define line may optionally include a comment including platform names.
+  */
+ 
++/* Wildcard match for FAM6 so X86_MATCH_INTEL_FAM6_MODEL(ANY) works */
++#define INTEL_FAM6_ANY                        X86_MODEL_ANY
++
+ #define INTEL_FAM6_CORE_YONAH         0x0E
+ 
+ #define INTEL_FAM6_CORE2_MEROM                0x0F
+@@ -126,6 +129,9 @@
+ #define INTEL_FAM6_XEON_PHI_KNL               0x57 /* Knights Landing */
+ #define INTEL_FAM6_XEON_PHI_KNM               0x85 /* Knights Mill */
+ 
++/* Family 5 */
++#define INTEL_FAM5_QUARK_X1000                0x09 /* Quark X1000 SoC */
++
+ /* Useful macros */
+ #define INTEL_CPU_FAM_ANY(_family, _model, _driver_data)      \
+ {                                                             \
+diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
+index cef4eba03ff36..713886d5493a8 100644
+--- a/arch/x86/include/asm/msr-index.h
++++ b/arch/x86/include/asm/msr-index.h
+@@ -47,6 +47,8 @@
+ #define SPEC_CTRL_STIBP                       BIT(SPEC_CTRL_STIBP_SHIFT)      /* STIBP mask */
+ #define SPEC_CTRL_SSBD_SHIFT          2          /* Speculative Store Bypass Disable bit */
+ #define SPEC_CTRL_SSBD                        BIT(SPEC_CTRL_SSBD_SHIFT)       /* Speculative Store Bypass Disable */
++#define SPEC_CTRL_RRSBA_DIS_S_SHIFT   6          /* Disable RRSBA behavior */
++#define SPEC_CTRL_RRSBA_DIS_S         BIT(SPEC_CTRL_RRSBA_DIS_S_SHIFT)
+ 
+ #define MSR_IA32_PRED_CMD             0x00000049 /* Prediction Command */
+ #define PRED_CMD_IBPB                 BIT(0)     /* Indirect Branch Prediction Barrier */
+@@ -82,6 +84,7 @@
+ #define MSR_IA32_ARCH_CAPABILITIES    0x0000010a
+ #define ARCH_CAP_RDCL_NO              BIT(0)  /* Not susceptible to Meltdown */
+ #define ARCH_CAP_IBRS_ALL             BIT(1)  /* Enhanced IBRS support */
++#define ARCH_CAP_RSBA                 BIT(2)  /* RET may use alternative branch predictors */
+ #define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH        BIT(3)  /* Skip L1D flush on vmentry */
+ #define ARCH_CAP_SSB_NO                       BIT(4)  /*
+                                                * Not susceptible to Speculative Store Bypass
+@@ -129,6 +132,13 @@
+                                                * bit available to control VERW
+                                                * behavior.
+                                                */
++#define ARCH_CAP_RRSBA                        BIT(19) /*
++                                               * Indicates RET may use predictors
++                                               * other than the RSB. With eIBRS
++                                               * enabled predictions in kernel mode
++                                               * are restricted to targets in
++                                               * kernel.
++                                               */
+ #define ARCH_CAP_PBRSB_NO             BIT(24) /*
+                                                * Not susceptible to Post-Barrier
+                                                * Return Stack Buffer Predictions.
+diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
+index a1ee1a760c3eb..8c898eed28941 100644
+--- a/arch/x86/include/asm/nospec-branch.h
++++ b/arch/x86/include/asm/nospec-branch.h
+@@ -4,11 +4,14 @@
+ #define _ASM_X86_NOSPEC_BRANCH_H_
+ 
+ #include <linux/static_key.h>
++#include <linux/frame.h>
+ 
+ #include <asm/alternative.h>
+ #include <asm/alternative-asm.h>
+ #include <asm/cpufeatures.h>
+ #include <asm/msr-index.h>
++#include <asm/unwind_hints.h>
++#include <asm/percpu.h>
+ 
+ /*
+  * This should be used immediately before a retpoline alternative. It tells
+@@ -60,9 +63,9 @@
+       lfence;                                 \
+       jmp     775b;                           \
+ 774:                                          \
++      add     $(BITS_PER_LONG/8) * 2, sp;     \
+       dec     reg;                            \
+       jnz     771b;                           \
+-      add     $(BITS_PER_LONG/8) * nr, sp;    \
+       /* barrier for jnz misprediction */     \
+       lfence;
+ #else
+@@ -79,13 +82,6 @@
+       add     $(BITS_PER_LONG/8) * nr, sp;
+ #endif
+ 
+-#define __ISSUE_UNBALANCED_RET_GUARD(sp)      \
+-      call    881f;                           \
+-      int3;                                   \
+-881:                                          \
+-      add     $(BITS_PER_LONG/8), sp;         \
+-      lfence;
+-
+ #ifdef __ASSEMBLY__
+ 
+ /*
+@@ -155,26 +151,28 @@
+ #endif
+ .endm
+ 
+-.macro ISSUE_UNBALANCED_RET_GUARD ftr:req
+-      ANNOTATE_NOSPEC_ALTERNATIVE
+-      ALTERNATIVE "jmp .Lskip_pbrsb_\@",                              \
+-              __stringify(__ISSUE_UNBALANCED_RET_GUARD(%_ASM_SP))     \
+-              \ftr
+-.Lskip_pbrsb_\@:
++.macro ISSUE_UNBALANCED_RET_GUARD
++      call .Lunbalanced_ret_guard_\@
++      int3
++.Lunbalanced_ret_guard_\@:
++      add $(BITS_PER_LONG/8), %_ASM_SP
++      lfence
+ .endm
+ 
+  /*
+   * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP
+   * monstrosity above, manually.
+   */
+-.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req
+-#ifdef CONFIG_RETPOLINE
+-      ANNOTATE_NOSPEC_ALTERNATIVE
+-      ALTERNATIVE "jmp .Lskip_rsb_\@",                                \
+-              __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP))    \
+-              \ftr
++.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ftr2
++.ifb \ftr2
++      ALTERNATIVE "jmp .Lskip_rsb_\@", "", \ftr
++.else
++      ALTERNATIVE_2 "jmp .Lskip_rsb_\@", "", \ftr, "jmp .Lunbalanced_\@", \ftr2
++.endif
++      __FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)
++.Lunbalanced_\@:
++      ISSUE_UNBALANCED_RET_GUARD
+ .Lskip_rsb_\@:
+-#endif
+ .endm
+ 
+ #else /* __ASSEMBLY__ */
+@@ -249,6 +247,7 @@ enum spectre_v2_mitigation {
+       SPECTRE_V2_EIBRS,
+       SPECTRE_V2_EIBRS_RETPOLINE,
+       SPECTRE_V2_EIBRS_LFENCE,
++      SPECTRE_V2_IBRS,
+ };
+ 
+ /* The indirect branch speculation control variants */
+@@ -312,6 +311,9 @@ static inline void indirect_branch_prediction_barrier(void)
+ 
+ /* The Intel SPEC CTRL MSR base value cache */
+ extern u64 x86_spec_ctrl_base;
++DECLARE_PER_CPU(u64, x86_spec_ctrl_current);
++extern void write_spec_ctrl_current(u64 val, bool force);
++extern u64 spec_ctrl_current(void);
+ 
+ /*
+  * With retpoline, we must use IBRS to restrict branch prediction
+@@ -321,18 +323,16 @@ extern u64 x86_spec_ctrl_base;
+  */
+ #define firmware_restrict_branch_speculation_start()                  \
+ do {                                                                  \
+-      u64 val = x86_spec_ctrl_base | SPEC_CTRL_IBRS;                  \
+-                                                                      \
+       preempt_disable();                                              \
+-      alternative_msr_write(MSR_IA32_SPEC_CTRL, val,                  \
++      alternative_msr_write(MSR_IA32_SPEC_CTRL,                       \
++                            spec_ctrl_current() | SPEC_CTRL_IBRS,     \
+                             X86_FEATURE_USE_IBRS_FW);                 \
+ } while (0)
+ 
+ #define firmware_restrict_branch_speculation_end()                    \
+ do {                                                                  \
+-      u64 val = x86_spec_ctrl_base;                                   \
+-                                                                      \
+-      alternative_msr_write(MSR_IA32_SPEC_CTRL, val,                  \
++      alternative_msr_write(MSR_IA32_SPEC_CTRL,                       \
++                            spec_ctrl_current(),                      \
+                             X86_FEATURE_USE_IBRS_FW);                 \
+       preempt_enable();                                               \
+ } while (0)
+diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
+index 88cef978380bf..5571b28d35b60 100644
+--- a/arch/x86/kernel/cpu/amd.c
++++ b/arch/x86/kernel/cpu/amd.c
+@@ -894,12 +894,21 @@ static void init_amd_zn(struct cpuinfo_x86 *c)
+       node_reclaim_distance = 32;
+ #endif
+ 
+-      /*
+-       * Fix erratum 1076: CPB feature bit not being set in CPUID.
+-       * Always set it, except when running under a hypervisor.
+-       */
+-      if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_CPB))
+-              set_cpu_cap(c, X86_FEATURE_CPB);
++      /* Fix up CPUID bits, but only if not virtualised. */
++      if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
++
++              /* Erratum 1076: CPB feature bit not being set in CPUID. */
++              if (!cpu_has(c, X86_FEATURE_CPB))
++                      set_cpu_cap(c, X86_FEATURE_CPB);
++
++              /*
++               * Zen3 (Fam19 model < 0x10) parts are not susceptible to
++               * Branch Type Confusion, but predate the allocation of the
++               * BTC_NO bit.
++               */
++              if (c->x86 == 0x19 && !cpu_has(c, X86_FEATURE_BTC_NO))
++                      set_cpu_cap(c, X86_FEATURE_BTC_NO);
++      }
+ }
+ 
+ static void init_amd(struct cpuinfo_x86 *c)
+diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
+index c90d91cb14341..cf5a18e261e36 100644
+--- a/arch/x86/kernel/cpu/bugs.c
++++ b/arch/x86/kernel/cpu/bugs.c
+@@ -37,6 +37,8 @@
+ 
+ static void __init spectre_v1_select_mitigation(void);
+ static void __init spectre_v2_select_mitigation(void);
++static void __init retbleed_select_mitigation(void);
++static void __init spectre_v2_user_select_mitigation(void);
+ static void __init ssb_select_mitigation(void);
+ static void __init l1tf_select_mitigation(void);
+ static void __init mds_select_mitigation(void);
+@@ -46,16 +48,40 @@ static void __init taa_select_mitigation(void);
+ static void __init mmio_select_mitigation(void);
+ static void __init srbds_select_mitigation(void);
+ 
+-/* The base value of the SPEC_CTRL MSR that always has to be preserved. */
++/* The base value of the SPEC_CTRL MSR without task-specific bits set */
+ u64 x86_spec_ctrl_base;
+ EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
++
++/* The current value of the SPEC_CTRL MSR with task-specific bits set */
++DEFINE_PER_CPU(u64, x86_spec_ctrl_current);
++EXPORT_SYMBOL_GPL(x86_spec_ctrl_current);
++
+ static DEFINE_MUTEX(spec_ctrl_mutex);
+ 
+ /*
+- * The vendor and possibly platform specific bits which can be modified in
+- * x86_spec_ctrl_base.
++ * Keep track of the SPEC_CTRL MSR value for the current task, which may differ
++ * from x86_spec_ctrl_base due to STIBP/SSB in __speculation_ctrl_update().
+  */
+-static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS;
++void write_spec_ctrl_current(u64 val, bool force)
++{
++      if (this_cpu_read(x86_spec_ctrl_current) == val)
++              return;
++
++      this_cpu_write(x86_spec_ctrl_current, val);
++
++      /*
++       * When KERNEL_IBRS this MSR is written on return-to-user, unless
++       * forced the update can be delayed until that time.
++       */
++      if (force || !cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
++              wrmsrl(MSR_IA32_SPEC_CTRL, val);
++}
++
++u64 spec_ctrl_current(void)
++{
++      return this_cpu_read(x86_spec_ctrl_current);
++}
++EXPORT_SYMBOL_GPL(spec_ctrl_current);
+ 
+ /*
+  * AMD specific MSR info for Speculative Store Bypass control.
+@@ -105,13 +131,21 @@ void __init check_bugs(void)
+       if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
+               rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+ 
+-      /* Allow STIBP in MSR_SPEC_CTRL if supported */
+-      if (boot_cpu_has(X86_FEATURE_STIBP))
+-              x86_spec_ctrl_mask |= SPEC_CTRL_STIBP;
+-
+       /* Select the proper CPU mitigations before patching alternatives: */
+       spectre_v1_select_mitigation();
+       spectre_v2_select_mitigation();
++      /*
++       * retbleed_select_mitigation() relies on the state set by
++       * spectre_v2_select_mitigation(); specifically it wants to know about
++       * spectre_v2=ibrs.
++       */
++      retbleed_select_mitigation();
++      /*
++       * spectre_v2_user_select_mitigation() relies on the state set by
++       * retbleed_select_mitigation(); specifically the STIBP selection is
++       * forced for UNRET.
++       */
++      spectre_v2_user_select_mitigation();
+       ssb_select_mitigation();
+       l1tf_select_mitigation();
+       md_clear_select_mitigation();
+@@ -151,31 +185,17 @@ void __init check_bugs(void)
+ #endif
+ }
+ 
++/*
++ * NOTE: For VMX, this function is not called in the vmexit path.
++ * It uses vmx_spec_ctrl_restore_host() instead.
++ */
+ void
+ x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
+ {
+-      u64 msrval, guestval, hostval = x86_spec_ctrl_base;
++      u64 msrval, guestval = guest_spec_ctrl, hostval = spec_ctrl_current();
+       struct thread_info *ti = current_thread_info();
+ 
+-      /* Is MSR_SPEC_CTRL implemented ? */
+       if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) {
+-              /*
+-               * Restrict guest_spec_ctrl to supported values. Clear the
+-               * modifiable bits in the host base value and or the
+-               * modifiable bits from the guest value.
+-               */
+-              guestval = hostval & ~x86_spec_ctrl_mask;
+-              guestval |= guest_spec_ctrl & x86_spec_ctrl_mask;
+-
+-              /* SSBD controlled in MSR_SPEC_CTRL */
+-              if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+-                  static_cpu_has(X86_FEATURE_AMD_SSBD))
+-                      hostval |= ssbd_tif_to_spec_ctrl(ti->flags);
+-
+-              /* Conditional STIBP enabled? */
+-              if (static_branch_unlikely(&switch_to_cond_stibp))
+-                      hostval |= stibp_tif_to_spec_ctrl(ti->flags);
+-
+               if (hostval != guestval) {
+                       msrval = setguest ? guestval : hostval;
+                       wrmsrl(MSR_IA32_SPEC_CTRL, msrval);
+@@ -705,12 +725,103 @@ static int __init nospectre_v1_cmdline(char *str)
+ }
+ early_param("nospectre_v1", nospectre_v1_cmdline);
+ 
+-#undef pr_fmt
+-#define pr_fmt(fmt)     "Spectre V2 : " fmt
+-
+ static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
+       SPECTRE_V2_NONE;
+ 
++#undef pr_fmt
++#define pr_fmt(fmt)     "RETBleed: " fmt
++
++enum retbleed_mitigation {
++      RETBLEED_MITIGATION_NONE,
++      RETBLEED_MITIGATION_IBRS,
++      RETBLEED_MITIGATION_EIBRS,
++};
++
++enum retbleed_mitigation_cmd {
++      RETBLEED_CMD_OFF,
++      RETBLEED_CMD_AUTO,
++};
++
++const char * const retbleed_strings[] = {
++      [RETBLEED_MITIGATION_NONE]      = "Vulnerable",
++      [RETBLEED_MITIGATION_IBRS]      = "Mitigation: IBRS",
++      [RETBLEED_MITIGATION_EIBRS]     = "Mitigation: Enhanced IBRS",
++};
++
++static enum retbleed_mitigation retbleed_mitigation __ro_after_init =
++      RETBLEED_MITIGATION_NONE;
++static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init =
++      RETBLEED_CMD_AUTO;
++
++static int __init retbleed_parse_cmdline(char *str)
++{
++      if (!str)
++              return -EINVAL;
++
++      if (!strcmp(str, "off"))
++              retbleed_cmd = RETBLEED_CMD_OFF;
++      else if (!strcmp(str, "auto"))
++              retbleed_cmd = RETBLEED_CMD_AUTO;
++      else
++              pr_err("Unknown retbleed option (%s). Defaulting to 'auto'\n", str);
++
++      return 0;
++}
++early_param("retbleed", retbleed_parse_cmdline);
++
++#define RETBLEED_UNTRAIN_MSG "WARNING: BTB untrained return thunk mitigation is only effective on AMD/Hygon!\n"
++#define RETBLEED_COMPILER_MSG "WARNING: kernel not compiled with RETPOLINE or -mfunction-return capable compiler!\n"
++#define RETBLEED_INTEL_MSG "WARNING: Spectre v2 mitigation leaves CPU vulnerable to RETBleed attacks, data leaks possible!\n"
++
++static void __init retbleed_select_mitigation(void)
++{
++      if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off())
++              return;
++
++      switch (retbleed_cmd) {
++      case RETBLEED_CMD_OFF:
++              return;
++
++      case RETBLEED_CMD_AUTO:
++      default:
++              /*
++               * The Intel mitigation (IBRS) was already selected in
++               * spectre_v2_select_mitigation().
++               */
++
++              break;
++      }
++
++      switch (retbleed_mitigation) {
++      default:
++              break;
++      }
++
++      /*
++       * Let IBRS trump all on Intel without affecting the effects of the
++       * retbleed= cmdline option.
++       */
++      if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
++              switch (spectre_v2_enabled) {
++              case SPECTRE_V2_IBRS:
++                      retbleed_mitigation = RETBLEED_MITIGATION_IBRS;
++                      break;
++              case SPECTRE_V2_EIBRS:
++              case SPECTRE_V2_EIBRS_RETPOLINE:
++              case SPECTRE_V2_EIBRS_LFENCE:
++                      retbleed_mitigation = RETBLEED_MITIGATION_EIBRS;
++                      break;
++              default:
++                      pr_err(RETBLEED_INTEL_MSG);
++              }
++      }
++
++      pr_info("%s\n", retbleed_strings[retbleed_mitigation]);
++}
++
++#undef pr_fmt
++#define pr_fmt(fmt)     "Spectre V2 : " fmt
++
+ static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init =
+       SPECTRE_V2_USER_NONE;
+ static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init =
+@@ -740,6 +851,7 @@ static inline const char *spectre_v2_module_string(void) { return ""; }
+ #define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n"
+ #define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n"
+ #define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n"
++#define SPECTRE_V2_IBRS_PERF_MSG "WARNING: IBRS mitigation selected on Enhanced IBRS CPU, this may cause unnecessary performance loss\n"
+ 
+ #ifdef CONFIG_BPF_SYSCALL
+ void unpriv_ebpf_notify(int new_state)
+@@ -781,6 +893,7 @@ enum spectre_v2_mitigation_cmd {
+       SPECTRE_V2_CMD_EIBRS,
+       SPECTRE_V2_CMD_EIBRS_RETPOLINE,
+       SPECTRE_V2_CMD_EIBRS_LFENCE,
++      SPECTRE_V2_CMD_IBRS,
+ };
+ 
+ enum spectre_v2_user_cmd {
+@@ -821,13 +934,15 @@ static void __init spec_v2_user_print_cond(const char *reason, bool secure)
+               pr_info("spectre_v2_user=%s forced on command line.\n", reason);
+ }
+ 
++static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd;
++
+ static enum spectre_v2_user_cmd __init
+-spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd)
++spectre_v2_parse_user_cmdline(void)
+ {
+       char arg[20];
+       int ret, i;
+ 
+-      switch (v2_cmd) {
++      switch (spectre_v2_cmd) {
+       case SPECTRE_V2_CMD_NONE:
+               return SPECTRE_V2_USER_CMD_NONE;
+       case SPECTRE_V2_CMD_FORCE:
+@@ -853,15 +968,16 @@ spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd)
+       return SPECTRE_V2_USER_CMD_AUTO;
+ }
+ 
+-static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode)
++static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode)
+ {
+-      return (mode == SPECTRE_V2_EIBRS ||
+-              mode == SPECTRE_V2_EIBRS_RETPOLINE ||
+-              mode == SPECTRE_V2_EIBRS_LFENCE);
++      return mode == SPECTRE_V2_IBRS ||
++             mode == SPECTRE_V2_EIBRS ||
++             mode == SPECTRE_V2_EIBRS_RETPOLINE ||
++             mode == SPECTRE_V2_EIBRS_LFENCE;
+ }
+ 
+ static void __init
+-spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
++spectre_v2_user_select_mitigation(void)
+ {
+       enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE;
+       bool smt_possible = IS_ENABLED(CONFIG_SMP);
+@@ -874,7 +990,7 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
+           cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
+               smt_possible = false;
+ 
+-      cmd = spectre_v2_parse_user_cmdline(v2_cmd);
++      cmd = spectre_v2_parse_user_cmdline();
+       switch (cmd) {
+       case SPECTRE_V2_USER_CMD_NONE:
+               goto set_mode;
+@@ -922,12 +1038,12 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
+       }
+ 
+       /*
+-       * If no STIBP, enhanced IBRS is enabled or SMT impossible, STIBP is not
+-       * required.
++       * If no STIBP, IBRS or enhanced IBRS is enabled, or SMT impossible,
++       * STIBP is not required.
+        */
+       if (!boot_cpu_has(X86_FEATURE_STIBP) ||
+           !smt_possible ||
+-          spectre_v2_in_eibrs_mode(spectre_v2_enabled))
++          spectre_v2_in_ibrs_mode(spectre_v2_enabled))
+               return;
+ 
+       /*
+@@ -952,6 +1068,7 @@ static const char * const spectre_v2_strings[] = {
+       [SPECTRE_V2_EIBRS]                      = "Mitigation: Enhanced IBRS",
+       [SPECTRE_V2_EIBRS_LFENCE]               = "Mitigation: Enhanced IBRS + LFENCE",
+       [SPECTRE_V2_EIBRS_RETPOLINE]            = "Mitigation: Enhanced IBRS + Retpolines",
++      [SPECTRE_V2_IBRS]                       = "Mitigation: IBRS",
+ };
+ 
+ static const struct {
+@@ -969,6 +1086,7 @@ static const struct {
+       { "eibrs,lfence",       SPECTRE_V2_CMD_EIBRS_LFENCE,      false },
+       { "eibrs,retpoline",    SPECTRE_V2_CMD_EIBRS_RETPOLINE,   false },
+       { "auto",               SPECTRE_V2_CMD_AUTO,              false },
++      { "ibrs",               SPECTRE_V2_CMD_IBRS,              false },
+ };
+ 
+ static void __init spec_v2_print_cond(const char *reason, bool secure)
+@@ -1031,6 +1149,24 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
+               return SPECTRE_V2_CMD_AUTO;
+       }
+ 
++      if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
++              pr_err("%s selected but not Intel CPU. Switching to AUTO select\n",
++                     mitigation_options[i].option);
++              return SPECTRE_V2_CMD_AUTO;
++      }
++
++      if (cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) {
++              pr_err("%s selected but CPU doesn't have IBRS. Switching to AUTO select\n",
++                     mitigation_options[i].option);
++              return SPECTRE_V2_CMD_AUTO;
++      }
++
++      if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_has(X86_FEATURE_XENPV)) {
++              pr_err("%s selected but running as XenPV guest. Switching to AUTO select\n",
++                     mitigation_options[i].option);
++              return SPECTRE_V2_CMD_AUTO;
++      }
++
+       spec_v2_print_cond(mitigation_options[i].option,
+                          mitigation_options[i].secure);
+       return cmd;
+@@ -1046,6 +1182,22 @@ static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void)
+       return SPECTRE_V2_RETPOLINE;
+ }
+ 
++/* Disable in-kernel use of non-RSB RET predictors */
++static void __init spec_ctrl_disable_kernel_rrsba(void)
++{
++      u64 ia32_cap;
++
++      if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL))
++              return;
++
++      ia32_cap = x86_read_arch_cap_msr();
++
++      if (ia32_cap & ARCH_CAP_RRSBA) {
++              x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S;
++              write_spec_ctrl_current(x86_spec_ctrl_base, true);
++      }
++}
++
+ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode)
+ {
+       /*
+@@ -1070,10 +1222,6 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_
+        */
+       switch (mode) {
+       case SPECTRE_V2_NONE:
+-      /* These modes already fill RSB at vmexit */
+-      case SPECTRE_V2_LFENCE:
+-      case SPECTRE_V2_RETPOLINE:
+-      case SPECTRE_V2_EIBRS_RETPOLINE:
+               return;
+ 
+       case SPECTRE_V2_EIBRS_LFENCE:
+@@ -1083,6 +1231,14 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_
+                       pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n");
+               }
+               return;
++
++      case SPECTRE_V2_EIBRS_RETPOLINE:
++      case SPECTRE_V2_RETPOLINE:
++      case SPECTRE_V2_LFENCE:
++      case SPECTRE_V2_IBRS:
++              setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT);
++              pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n");
++              return;
+       }
+ 
+       pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit");
+@@ -1113,6 +1269,14 @@ static void __init spectre_v2_select_mitigation(void)
+                       break;
+               }
+ 
++              if (boot_cpu_has_bug(X86_BUG_RETBLEED) &&
++                  retbleed_cmd != RETBLEED_CMD_OFF &&
++                  boot_cpu_has(X86_FEATURE_IBRS) &&
++                  boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
++                      mode = SPECTRE_V2_IBRS;
++                      break;
++              }
++
+               mode = spectre_v2_select_retpoline();
+               break;
+ 
+@@ -1129,6 +1293,10 @@ static void __init spectre_v2_select_mitigation(void)
+               mode = spectre_v2_select_retpoline();
+               break;
+ 
++      case SPECTRE_V2_CMD_IBRS:
++              mode = SPECTRE_V2_IBRS;
++              break;
++
+       case SPECTRE_V2_CMD_EIBRS:
+               mode = SPECTRE_V2_EIBRS;
+               break;
+@@ -1145,10 +1313,9 @@ static void __init spectre_v2_select_mitigation(void)
+       if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+               pr_err(SPECTRE_V2_EIBRS_EBPF_MSG);
+ 
+-      if (spectre_v2_in_eibrs_mode(mode)) {
+-              /* Force it so VMEXIT will restore correctly */
++      if (spectre_v2_in_ibrs_mode(mode)) {
+               x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
+-              wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
++              write_spec_ctrl_current(x86_spec_ctrl_base, true);
+       }
+ 
+       switch (mode) {
+@@ -1156,6 +1323,12 @@ static void __init spectre_v2_select_mitigation(void)
+       case SPECTRE_V2_EIBRS:
+               break;
+ 
++      case SPECTRE_V2_IBRS:
++              setup_force_cpu_cap(X86_FEATURE_KERNEL_IBRS);
++              if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED))
++                      pr_warn(SPECTRE_V2_IBRS_PERF_MSG);
++              break;
++
+       case SPECTRE_V2_LFENCE:
+       case SPECTRE_V2_EIBRS_LFENCE:
+               setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE);
+@@ -1167,16 +1340,56 @@ static void __init spectre_v2_select_mitigation(void)
+               break;
+       }
+ 
++      /*
++       * Disable alternate RSB predictions in kernel when indirect CALLs and
++       * JMPs gets protection against BHI and Intramode-BTI, but RET
++       * prediction from a non-RSB predictor is still a risk.
++       */
++      if (mode == SPECTRE_V2_EIBRS_LFENCE ||
++          mode == SPECTRE_V2_EIBRS_RETPOLINE ||
++          mode == SPECTRE_V2_RETPOLINE)
++              spec_ctrl_disable_kernel_rrsba();
++
+       spectre_v2_enabled = mode;
+       pr_info("%s\n", spectre_v2_strings[mode]);
+ 
+       /*
+-       * If spectre v2 protection has been enabled, unconditionally fill
+-       * RSB during a context switch; this protects against two independent
+-       * issues:
++       * If Spectre v2 protection has been enabled, fill the RSB during a
++       * context switch.  In general there are two types of RSB attacks
++       * across context switches, for which the CALLs/RETs may be unbalanced.
++       *
++       * 1) RSB underflow
++       *
++       *    Some Intel parts have "bottomless RSB".  When the RSB is empty,
++       *    speculated return targets may come from the branch predictor,
++       *    which could have a user-poisoned BTB or BHB entry.
++       *
++       *    AMD has it even worse: *all* returns are speculated from the BTB,
++       *    regardless of the state of the RSB.
++       *
++       *    When IBRS or eIBRS is enabled, the "user -> kernel" attack
++       *    scenario is mitigated by the IBRS branch prediction isolation
++       *    properties, so the RSB buffer filling wouldn't be necessary to
++       *    protect against this type of attack.
++       *
++       *    The "user -> user" attack scenario is mitigated by RSB filling.
+        *
+-       *      - RSB underflow (and switch to BTB) on Skylake+
+-       *      - SpectreRSB variant of spectre v2 on X86_BUG_SPECTRE_V2 CPUs
++       * 2) Poisoned RSB entry
++       *
++       *    If the 'next' in-kernel return stack is shorter than 'prev',
++       *    'next' could be tricked into speculating with a user-poisoned RSB
++       *    entry.
++       *
++       *    The "user -> kernel" attack scenario is mitigated by SMEP and
++       *    eIBRS.
++       *
++       *    The "user -> user" scenario, also known as SpectreBHB, requires
++       *    RSB clearing.
++       *
++       * So to mitigate all cases, unconditionally fill RSB on context
++       * switches.
++       *
++       * FIXME: Is this pointless for retbleed-affected AMD?
+        */
+       setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
+       pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n");
+@@ -1184,28 +1397,29 @@ static void __init spectre_v2_select_mitigation(void)
+       spectre_v2_determine_rsb_fill_type_at_vmexit(mode);
+ 
+       /*
+-       * Retpoline means the kernel is safe because it has no indirect
+-       * branches. Enhanced IBRS protects firmware too, so, enable restricted
+-       * speculation around firmware calls only when Enhanced IBRS isn't
+-       * supported.
++       * Retpoline protects the kernel, but doesn't protect firmware.  IBRS
++       * and Enhanced IBRS protect firmware too, so enable IBRS around
++       * firmware calls only when IBRS / Enhanced IBRS aren't otherwise
++       * enabled.
+        *
+        * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because
+        * the user might select retpoline on the kernel command line and if
+        * the CPU supports Enhanced IBRS, kernel might un-intentionally not
+        * enable IBRS around firmware calls.
+        */
+-      if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_eibrs_mode(mode)) {
++      if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) {
+               setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
+               pr_info("Enabling Restricted Speculation for firmware calls\n");
+       }
+ 
+       /* Set up IBPB and STIBP depending on the general spectre V2 command */
+-      spectre_v2_user_select_mitigation(cmd);
++      spectre_v2_cmd = cmd;
+ }
+ 
+ static void update_stibp_msr(void * __unused)
+ {
+-      wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
++      u64 val = spec_ctrl_current() | (x86_spec_ctrl_base & SPEC_CTRL_STIBP);
++      write_spec_ctrl_current(val, true);
+ }
+ 
+ /* Update x86_spec_ctrl_base in case SMT state changed. */
+@@ -1421,16 +1635,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
+               break;
+       }
+ 
+-      /*
+-       * If SSBD is controlled by the SPEC_CTRL MSR, then set the proper
+-       * bit in the mask to allow guests to use the mitigation even in the
+-       * case where the host does not enable it.
+-       */
+-      if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+-          static_cpu_has(X86_FEATURE_AMD_SSBD)) {
+-              x86_spec_ctrl_mask |= SPEC_CTRL_SSBD;
+-      }
+-
+       /*
+        * We have three CPU feature flags that are in play here:
+        *  - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible.
+@@ -1448,7 +1652,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
+                       x86_amd_ssb_disable();
+               } else {
+                       x86_spec_ctrl_base |= SPEC_CTRL_SSBD;
+-                      wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
++                      write_spec_ctrl_current(x86_spec_ctrl_base, true);
+               }
+       }
+ 
+@@ -1665,7 +1869,7 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
+ void x86_spec_ctrl_setup_ap(void)
+ {
+       if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
+-              wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
++              write_spec_ctrl_current(x86_spec_ctrl_base, true);
+ 
+       if (ssb_mode == SPEC_STORE_BYPASS_DISABLE)
+               x86_amd_ssb_disable();
+@@ -1900,7 +2104,7 @@ static ssize_t mmio_stale_data_show_state(char *buf)
+ 
+ static char *stibp_state(void)
+ {
+-      if (spectre_v2_in_eibrs_mode(spectre_v2_enabled))
++      if (spectre_v2_in_ibrs_mode(spectre_v2_enabled))
+               return "";
+ 
+       switch (spectre_v2_user_stibp) {
+@@ -1934,7 +2138,7 @@ static char *pbrsb_eibrs_state(void)
+ {
+       if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) {
+               if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) ||
+-                  boot_cpu_has(X86_FEATURE_RETPOLINE))
++                  boot_cpu_has(X86_FEATURE_RSB_VMEXIT))
+                       return ", PBRSB-eIBRS: SW sequence";
+               else
+                       return ", PBRSB-eIBRS: Vulnerable";
+@@ -1970,6 +2174,11 @@ static ssize_t srbds_show_state(char *buf)
+       return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]);
+ }
+ 
++static ssize_t retbleed_show_state(char *buf)
++{
++      return sprintf(buf, "%s\n", retbleed_strings[retbleed_mitigation]);
++}
++
+ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
+                              char *buf, unsigned int bug)
+ {
+@@ -2016,6 +2225,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
+       case X86_BUG_MMIO_UNKNOWN:
+               return mmio_stale_data_show_state(buf);
+ 
++      case X86_BUG_RETBLEED:
++              return retbleed_show_state(buf);
++
+       default:
+               break;
+       }
+@@ -2075,4 +2287,9 @@ ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *at
+       else
+               return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA);
+ }
++
++ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf)
++{
++      return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED);
++}
+ #endif
+diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
+index 59413e741ecf1..5e1e32f1086ba 100644
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -1102,48 +1102,60 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
+       {}
+ };
+ 
++#define VULNBL(vendor, family, model, blacklist)      \
++      X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist)
++
+ #define VULNBL_INTEL_STEPPINGS(model, steppings, issues)                 \
+       X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6,             \
+                                           INTEL_FAM6_##model, steppings, \
+                                           X86_FEATURE_ANY, issues)
+ 
++#define VULNBL_AMD(family, blacklist)         \
++      VULNBL(AMD, family, X86_MODEL_ANY, blacklist)
++
++#define VULNBL_HYGON(family, blacklist)               \
++      VULNBL(HYGON, family, X86_MODEL_ANY, blacklist)
++
+ #define SRBDS         BIT(0)
+ /* CPU is affected by X86_BUG_MMIO_STALE_DATA */
+ #define MMIO          BIT(1)
+ /* CPU is affected by Shared Buffers Data Sampling (SBDS), a variant of X86_BUG_MMIO_STALE_DATA */
+ #define MMIO_SBDS     BIT(2)
++/* CPU is affected by RETbleed, speculating where you would not expect it */
++#define RETBLEED      BIT(3)
+ 
+ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
+       VULNBL_INTEL_STEPPINGS(IVYBRIDGE,       X86_STEPPING_ANY,               SRBDS),
+       VULNBL_INTEL_STEPPINGS(HASWELL,         X86_STEPPING_ANY,               SRBDS),
+       VULNBL_INTEL_STEPPINGS(HASWELL_L,       X86_STEPPING_ANY,               SRBDS),
+       VULNBL_INTEL_STEPPINGS(HASWELL_G,       X86_STEPPING_ANY,               SRBDS),
+-      VULNBL_INTEL_STEPPINGS(HASWELL_X,       BIT(2) | BIT(4),                MMIO),
+-      VULNBL_INTEL_STEPPINGS(BROADWELL_D,     X86_STEPPINGS(0x3, 0x5),        MMIO),
++      VULNBL_INTEL_STEPPINGS(HASWELL_X,       X86_STEPPING_ANY,               MMIO),
++      VULNBL_INTEL_STEPPINGS(BROADWELL_D,     X86_STEPPING_ANY,               MMIO),
+       VULNBL_INTEL_STEPPINGS(BROADWELL_G,     X86_STEPPING_ANY,               SRBDS),
+       VULNBL_INTEL_STEPPINGS(BROADWELL_X,     X86_STEPPING_ANY,               MMIO),
+       VULNBL_INTEL_STEPPINGS(BROADWELL,       X86_STEPPING_ANY,               SRBDS),
+-      VULNBL_INTEL_STEPPINGS(SKYLAKE_L,       X86_STEPPINGS(0x3, 0x3),        SRBDS | MMIO),
+-      VULNBL_INTEL_STEPPINGS(SKYLAKE_L,       X86_STEPPING_ANY,               SRBDS),
+-      VULNBL_INTEL_STEPPINGS(SKYLAKE_X,       BIT(3) | BIT(4) | BIT(6) |
+-                                              BIT(7) | BIT(0xB),              MMIO),
+-      VULNBL_INTEL_STEPPINGS(SKYLAKE,         X86_STEPPINGS(0x3, 0x3),        SRBDS | MMIO),
+-      VULNBL_INTEL_STEPPINGS(SKYLAKE,         X86_STEPPING_ANY,               SRBDS),
+-      VULNBL_INTEL_STEPPINGS(KABYLAKE_L,      X86_STEPPINGS(0x9, 0xC),        SRBDS | MMIO),
+-      VULNBL_INTEL_STEPPINGS(KABYLAKE_L,      X86_STEPPINGS(0x0, 0x8),        SRBDS),
+-      VULNBL_INTEL_STEPPINGS(KABYLAKE,        X86_STEPPINGS(0x9, 0xD),        SRBDS | MMIO),
+-      VULNBL_INTEL_STEPPINGS(KABYLAKE,        X86_STEPPINGS(0x0, 0x8),        SRBDS),
+-      VULNBL_INTEL_STEPPINGS(ICELAKE_L,       X86_STEPPINGS(0x5, 0x5),        MMIO | MMIO_SBDS),
+-      VULNBL_INTEL_STEPPINGS(ICELAKE_D,       X86_STEPPINGS(0x1, 0x1),        MMIO),
+-      VULNBL_INTEL_STEPPINGS(ICELAKE_X,       X86_STEPPINGS(0x4, 0x6),        MMIO),
+-      VULNBL_INTEL_STEPPINGS(COMETLAKE,       BIT(2) | BIT(3) | BIT(5),       MMIO | MMIO_SBDS),
+-      VULNBL_INTEL_STEPPINGS(COMETLAKE_L,     X86_STEPPINGS(0x1, 0x1),        MMIO | MMIO_SBDS),
+-      VULNBL_INTEL_STEPPINGS(COMETLAKE_L,     X86_STEPPINGS(0x0, 0x0),        MMIO),
+-      VULNBL_INTEL_STEPPINGS(LAKEFIELD,       X86_STEPPINGS(0x1, 0x1),        MMIO | MMIO_SBDS),
+-      VULNBL_INTEL_STEPPINGS(ROCKETLAKE,      X86_STEPPINGS(0x1, 0x1),        MMIO),
+-      VULNBL_INTEL_STEPPINGS(ATOM_TREMONT,    X86_STEPPINGS(0x1, 0x1),        MMIO | MMIO_SBDS),
++      VULNBL_INTEL_STEPPINGS(SKYLAKE_L,       X86_STEPPING_ANY,               SRBDS | MMIO | RETBLEED),
++      VULNBL_INTEL_STEPPINGS(SKYLAKE_X,       X86_STEPPING_ANY,               MMIO | RETBLEED),
++      VULNBL_INTEL_STEPPINGS(SKYLAKE,         X86_STEPPING_ANY,               SRBDS | MMIO | RETBLEED),
++      VULNBL_INTEL_STEPPINGS(KABYLAKE_L,      X86_STEPPING_ANY,               SRBDS | MMIO | RETBLEED),
++      VULNBL_INTEL_STEPPINGS(KABYLAKE,        X86_STEPPING_ANY,               SRBDS | MMIO | RETBLEED),
++      VULNBL_INTEL_STEPPINGS(CANNONLAKE_L,    X86_STEPPING_ANY,               RETBLEED),
++      VULNBL_INTEL_STEPPINGS(ICELAKE_L,       X86_STEPPING_ANY,               MMIO | MMIO_SBDS | RETBLEED),
++      VULNBL_INTEL_STEPPINGS(ICELAKE_D,       X86_STEPPING_ANY,               MMIO),
++      VULNBL_INTEL_STEPPINGS(ICELAKE_X,       X86_STEPPING_ANY,               MMIO),
++      VULNBL_INTEL_STEPPINGS(COMETLAKE,       X86_STEPPING_ANY,               MMIO | MMIO_SBDS | RETBLEED),
++      VULNBL_INTEL_STEPPINGS(COMETLAKE_L,     X86_STEPPINGS(0x0, 0x0),        MMIO | RETBLEED),
++      VULNBL_INTEL_STEPPINGS(COMETLAKE_L,     X86_STEPPING_ANY,               MMIO | MMIO_SBDS | RETBLEED),
++      VULNBL_INTEL_STEPPINGS(LAKEFIELD,       X86_STEPPING_ANY,               MMIO | MMIO_SBDS | RETBLEED),
++      VULNBL_INTEL_STEPPINGS(ROCKETLAKE,      X86_STEPPING_ANY,               MMIO | RETBLEED),
++      VULNBL_INTEL_STEPPINGS(ATOM_TREMONT,    X86_STEPPING_ANY,               MMIO | MMIO_SBDS),
+       VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D,  X86_STEPPING_ANY,               MMIO),
+-      VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L,  X86_STEPPINGS(0x0, 0x0),        MMIO | MMIO_SBDS),
++      VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L,  X86_STEPPING_ANY,               MMIO | MMIO_SBDS),
++
++      VULNBL_AMD(0x15, RETBLEED),
++      VULNBL_AMD(0x16, RETBLEED),
++      VULNBL_AMD(0x17, RETBLEED),
++      VULNBL_HYGON(0x18, RETBLEED),
+       {}
+ };
+ 
+@@ -1251,6 +1263,11 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
+                       setup_force_cpu_bug(X86_BUG_MMIO_UNKNOWN);
+       }
+ 
++      if (!cpu_has(c, X86_FEATURE_BTC_NO)) {
++              if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA))
++                      setup_force_cpu_bug(X86_BUG_RETBLEED);
++      }
++
+       if (cpu_has(c, X86_FEATURE_IBRS_ENHANCED) &&
+           !cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) &&
+           !(ia32_cap & ARCH_CAP_PBRSB_NO))
+diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
+index 2f163e6646b6f..ad6776081e60d 100644
+--- a/arch/x86/kernel/cpu/match.c
++++ b/arch/x86/kernel/cpu/match.c
+@@ -16,12 +16,17 @@
+  * respective wildcard entries.
+  *
+  * A typical table entry would be to match a specific CPU
+- * { X86_VENDOR_INTEL, 6, 0x12 }
+- * or to match a specific CPU feature
+- * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) }
++ *
++ * X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_BROADWELL,
++ *                                  X86_FEATURE_ANY, NULL);
+  *
+  * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY,
+- * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor)
++ * %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor)
++ *
++ * asm/cpu_device_id.h contains a set of useful macros which are shortcuts
++ * for various common selections. The above can be shortened to:
++ *
++ * X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, NULL);
+  *
+  * Arrays used to match for this should also be declared using
+  * MODULE_DEVICE_TABLE(x86cpu, ...)
+diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
+index 53004dbd55c47..a03e309a0ac5f 100644
+--- a/arch/x86/kernel/cpu/scattered.c
++++ b/arch/x86/kernel/cpu/scattered.c
+@@ -26,6 +26,7 @@ struct cpuid_bit {
+ static const struct cpuid_bit cpuid_bits[] = {
+       { X86_FEATURE_APERFMPERF,       CPUID_ECX,  0, 0x00000006, 0 },
+       { X86_FEATURE_EPB,              CPUID_ECX,  3, 0x00000006, 0 },
++      { X86_FEATURE_RRSBA_CTRL,       CPUID_EDX,  2, 0x00000007, 2 },
+       { X86_FEATURE_CQM_LLC,          CPUID_EDX,  1, 0x0000000f, 0 },
+       { X86_FEATURE_CQM_OCCUP_LLC,    CPUID_EDX,  0, 0x0000000f, 1 },
+       { X86_FEATURE_CQM_MBM_TOTAL,    CPUID_EDX,  1, 0x0000000f, 1 },
+diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
+index 068715a52ac10..87cfd2ee9ca0d 100644
+--- a/arch/x86/kernel/process.c
++++ b/arch/x86/kernel/process.c
+@@ -449,7 +449,7 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp,
+       }
+ 
+       if (updmsr)
+-              wrmsrl(MSR_IA32_SPEC_CTRL, msr);
++              write_spec_ctrl_current(msr, false);
+ }
+ 
+ static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk)
+diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
+index 1efcc7d4bc88e..3db407e3c4166 100644
+--- a/arch/x86/kvm/svm.c
++++ b/arch/x86/kvm/svm.c
+@@ -47,6 +47,7 @@
+ #include <asm/kvm_para.h>
+ #include <asm/irq_remapping.h>
+ #include <asm/spec-ctrl.h>
++#include <asm/cpu_device_id.h>
+ 
+ #include <asm/virtext.h>
+ #include "trace.h"
+diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
+index 34ee4835b0177..a7b62a00913e5 100644
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -11,6 +11,7 @@
+ #include "mmu.h"
+ #include "nested.h"
+ #include "trace.h"
++#include "vmx.h"
+ #include "x86.h"
+ 
+ static bool __read_mostly enable_shadow_vmcs = 1;
+@@ -2863,35 +2864,8 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
+               vmx->loaded_vmcs->host_state.cr4 = cr4;
+       }
+ 
+-      asm(
+-              "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
+-              "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
+-              "je 1f \n\t"
+-              __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t"
+-              "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
+-              "1: \n\t"
+-              "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
+-
+-              /* Check if vmlaunch or vmresume is needed */
+-              "cmpb $0, %c[launched](%[loaded_vmcs])\n\t"
+-
+-              /*
+-               * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set
+-               * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail
+-               * Valid.  vmx_vmenter() directly "returns" RFLAGS, and so the
+-               * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail.
+-               */
+-              "call vmx_vmenter\n\t"
+-
+-              CC_SET(be)
+-            : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail)
+-            : [HOST_RSP]"r"((unsigned long)HOST_RSP),
+-              [loaded_vmcs]"r"(vmx->loaded_vmcs),
+-              [launched]"i"(offsetof(struct loaded_vmcs, launched)),
+-              [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
+-              [wordsize]"i"(sizeof(ulong))
+-            : "memory"
+-      );
++      vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
++                               __vmx_vcpu_run_flags(vmx));
+ 
+       if (vmx->msr_autoload.host.nr)
+               vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h
+new file mode 100644
+index 0000000000000..edc3f16cc1896
+--- /dev/null
++++ b/arch/x86/kvm/vmx/run_flags.h
+@@ -0,0 +1,8 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++#ifndef __KVM_X86_VMX_RUN_FLAGS_H
++#define __KVM_X86_VMX_RUN_FLAGS_H
++
++#define VMX_RUN_VMRESUME      (1 << 0)
++#define VMX_RUN_SAVE_SPEC_CTRL        (1 << 1)
++
++#endif /* __KVM_X86_VMX_RUN_FLAGS_H */
+diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
+index 946d9205c3b6d..2850670c38bb0 100644
+--- a/arch/x86/kvm/vmx/vmenter.S
++++ b/arch/x86/kvm/vmx/vmenter.S
+@@ -4,6 +4,7 @@
+ #include <asm/bitsperlong.h>
+ #include <asm/kvm_vcpu_regs.h>
+ #include <asm/nospec-branch.h>
++#include "run_flags.h"
+ 
+ #define WORD_SIZE (BITS_PER_LONG / 8)
+ 
+@@ -29,78 +30,12 @@
+ 
+       .text
+ 
+-/**
+- * vmx_vmenter - VM-Enter the current loaded VMCS
+- *
+- * %RFLAGS.ZF:        !VMCS.LAUNCHED, i.e. controls VMLAUNCH vs. VMRESUME
+- *
+- * Returns:
+- *    %RFLAGS.CF is set on VM-Fail Invalid
+- *    %RFLAGS.ZF is set on VM-Fail Valid
+- *    %RFLAGS.{CF,ZF} are cleared on VM-Success, i.e. VM-Exit
+- *
+- * Note that VMRESUME/VMLAUNCH fall-through and return directly if
+- * they VM-Fail, whereas a successful VM-Enter + VM-Exit will jump
+- * to vmx_vmexit.
+- */
+-ENTRY(vmx_vmenter)
+-      /* EFLAGS.ZF is set if VMCS.LAUNCHED == 0 */
+-      je 2f
+-
+-1:    vmresume
+-      ret
+-
+-2:    vmlaunch
+-      ret
+-
+-3:    cmpb $0, kvm_rebooting
+-      je 4f
+-      ret
+-4:    ud2
+-
+-      .pushsection .fixup, "ax"
+-5:    jmp 3b
+-      .popsection
+-
+-      _ASM_EXTABLE(1b, 5b)
+-      _ASM_EXTABLE(2b, 5b)
+-
+-ENDPROC(vmx_vmenter)
+-
+-/**
+- * vmx_vmexit - Handle a VMX VM-Exit
+- *
+- * Returns:
+- *    %RFLAGS.{CF,ZF} are cleared on VM-Success, i.e. VM-Exit
+- *
+- * This is vmx_vmenter's partner in crime.  On a VM-Exit, control will jump
+- * here after hardware loads the host's state, i.e. this is the destination
+- * referred to by VMCS.HOST_RIP.
+- */
+-ENTRY(vmx_vmexit)
+-#ifdef CONFIG_RETPOLINE
+-      ALTERNATIVE "jmp .Lvmexit_skip_rsb", "", X86_FEATURE_RETPOLINE
+-      /* Preserve guest's RAX, it's used to stuff the RSB. */
+-      push %_ASM_AX
+-
+-      /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
+-      FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
+-
+-      /* Clear RFLAGS.CF and RFLAGS.ZF to preserve VM-Exit, i.e. !VM-Fail. */
+-      or $1, %_ASM_AX
+-
+-      pop %_ASM_AX
+-.Lvmexit_skip_rsb:
+-#endif
+-      ISSUE_UNBALANCED_RET_GUARD X86_FEATURE_RSB_VMEXIT_LITE
+-      ret
+-ENDPROC(vmx_vmexit)
+-
+ /**
+  * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode
+- * @vmx:      struct vcpu_vmx * (forwarded to vmx_update_host_rsp)
++ * @vmx:      struct vcpu_vmx *
+  * @regs:     unsigned long * (to guest registers)
+- * @launched: %true if the VMCS has been launched
++ * @flags:    VMX_RUN_VMRESUME:       use VMRESUME instead of VMLAUNCH
++ *            VMX_RUN_SAVE_SPEC_CTRL: save guest SPEC_CTRL into vmx->spec_ctrl
+  *
+  * Returns:
+  *    0 on VM-Exit, 1 on VM-Fail
+@@ -119,24 +54,29 @@ ENTRY(__vmx_vcpu_run)
+ #endif
+       push %_ASM_BX
+ 
++      /* Save @vmx for SPEC_CTRL handling */
++      push %_ASM_ARG1
++
++      /* Save @flags for SPEC_CTRL handling */
++      push %_ASM_ARG3
++
+       /*
+        * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and
+        * @regs is needed after VM-Exit to save the guest's register values.
+        */
+       push %_ASM_ARG2
+ 
+-      /* Copy @launched to BL, _ASM_ARG3 is volatile. */
++      /* Copy @flags to BL, _ASM_ARG3 is volatile. */
+       mov %_ASM_ARG3B, %bl
+ 
+-      /* Adjust RSP to account for the CALL to vmx_vmenter(). */
+-      lea -WORD_SIZE(%_ASM_SP), %_ASM_ARG2
++      lea (%_ASM_SP), %_ASM_ARG2
+       call vmx_update_host_rsp
+ 
+       /* Load @regs to RAX. */
+       mov (%_ASM_SP), %_ASM_AX
+ 
+       /* Check if vmlaunch or vmresume is needed */
+-      cmpb $0, %bl
++      testb $VMX_RUN_VMRESUME, %bl
+ 
+       /* Load guest registers.  Don't clobber flags. */
+       mov VCPU_RBX(%_ASM_AX), %_ASM_BX
+@@ -158,11 +98,25 @@ ENTRY(__vmx_vcpu_run)
+       /* Load guest RAX.  This kills the @regs pointer! */
+       mov VCPU_RAX(%_ASM_AX), %_ASM_AX
+ 
+-      /* Enter guest mode */
+-      call vmx_vmenter
++      /* Check EFLAGS.ZF from 'testb' above */
++      jz .Lvmlaunch
+ 
+-      /* Jump on VM-Fail. */
+-      jbe 2f
++/*
++ * If VMRESUME/VMLAUNCH and corresponding vmexit succeed, execution resumes at
++ * the 'vmx_vmexit' label below.
++ */
++.Lvmresume:
++      vmresume
++      jmp .Lvmfail
++
++.Lvmlaunch:
++      vmlaunch
++      jmp .Lvmfail
++
++      _ASM_EXTABLE(.Lvmresume, .Lfixup)
++      _ASM_EXTABLE(.Lvmlaunch, .Lfixup)
++
++SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL)
+ 
+       /* Temporarily save guest's RAX. */
+       push %_ASM_AX
+@@ -189,19 +143,21 @@ ENTRY(__vmx_vcpu_run)
+       mov %r15, VCPU_R15(%_ASM_AX)
+ #endif
+ 
+-      /* Clear RAX to indicate VM-Exit (as opposed to VM-Fail). */
+-      xor %eax, %eax
++      /* Clear return value to indicate VM-Exit (as opposed to VM-Fail). */
++      xor %ebx, %ebx
+ 
++.Lclear_regs:
+       /*
+-       * Clear all general purpose registers except RSP and RAX to prevent
++       * Clear all general purpose registers except RSP and RBX to prevent
+        * speculative use of the guest's values, even those that are reloaded
+        * via the stack.  In theory, an L1 cache miss when restoring registers
+        * could lead to speculative execution with the guest's values.
+        * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially
+        * free.  RSP and RAX are exempt as RSP is restored by hardware during
+-       * VM-Exit and RAX is explicitly loaded with 0 or 1 to return VM-Fail.
++       * VM-Exit and RBX is explicitly loaded with 0 or 1 to hold the return
++       * value.
+        */
+-1:    xor %ebx, %ebx
++      xor %eax, %eax
+       xor %ecx, %ecx
+       xor %edx, %edx
+       xor %esi, %esi
+@@ -220,8 +176,32 @@ ENTRY(__vmx_vcpu_run)
+ 
+       /* "POP" @regs. */
+       add $WORD_SIZE, %_ASM_SP
+-      pop %_ASM_BX
+ 
++      /*
++       * IMPORTANT: RSB filling and SPEC_CTRL handling must be done before
++       * the first unbalanced RET after vmexit!
++       *
++       * For retpoline or IBRS, RSB filling is needed to prevent poisoned RSB
++       * entries and (in some cases) RSB underflow.
++       *
++       * eIBRS has its own protection against poisoned RSB, so it doesn't
++       * need the RSB filling sequence.  But it does need to be enabled, and a
++       * single call to retire, before the first unbalanced RET.
++         */
++
++      FILL_RETURN_BUFFER %_ASM_CX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT,\
++                         X86_FEATURE_RSB_VMEXIT_LITE
++
++
++      pop %_ASM_ARG2  /* @flags */
++      pop %_ASM_ARG1  /* @vmx */
++
++      call vmx_spec_ctrl_restore_host
++
++      /* Put return value in AX */
++      mov %_ASM_BX, %_ASM_AX
++
++      pop %_ASM_BX
+ #ifdef CONFIG_X86_64
+       pop %r12
+       pop %r13
+@@ -234,11 +214,20 @@ ENTRY(__vmx_vcpu_run)
+       pop %_ASM_BP
+       ret
+ 
+-      /* VM-Fail.  Out-of-line to avoid a taken Jcc after VM-Exit. */
+-2:    mov $1, %eax
+-      jmp 1b
++.Lfixup:
++      cmpb $0, kvm_rebooting
++      jne .Lvmfail
++      ud2
++.Lvmfail:
++      /* VM-Fail: set return value to 1 */
++      mov $1, %_ASM_BX
++      jmp .Lclear_regs
++
+ ENDPROC(__vmx_vcpu_run)
+ 
++
++.section .text, "ax"
++
+ /**
+  * vmread_error_trampoline - Trampoline from inline asm to vmread_error()
+  * @field:    VMCS field encoding that failed
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 4bd1bf6214eea..d522c9de41df9 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -31,6 +31,7 @@
+ #include <asm/apic.h>
+ #include <asm/asm.h>
+ #include <asm/cpu.h>
++#include <asm/cpu_device_id.h>
+ #include <asm/debugreg.h>
+ #include <asm/desc.h>
+ #include <asm/fpu/internal.h>
+@@ -358,9 +359,9 @@ static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
+       if (!vmx->disable_fb_clear)
+               return;
+ 
+-      rdmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
++      msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL);
+       msr |= FB_CLEAR_DIS;
+-      wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
++      native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
+       /* Cache the MSR value to avoid reading it later */
+       vmx->msr_ia32_mcu_opt_ctrl = msr;
+ }
+@@ -371,7 +372,7 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
+               return;
+ 
+       vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
+-      wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
++      native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
+ }
+ 
+ static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
+@@ -862,6 +863,24 @@ static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
+       return true;
+ }
+ 
++unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
++{
++      unsigned int flags = 0;
++
++      if (vmx->loaded_vmcs->launched)
++              flags |= VMX_RUN_VMRESUME;
++
++      /*
++       * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free
++       * to change it directly without causing a vmexit.  In that case read
++       * it after vmexit and store it in vmx->spec_ctrl.
++       */
++      if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)))
++              flags |= VMX_RUN_SAVE_SPEC_CTRL;
++
++      return flags;
++}
++
+ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
+               unsigned long entry, unsigned long exit)
+ {
+@@ -6539,7 +6558,30 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
+       }
+ }
+ 
+-bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
++void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
++                                      unsigned int flags)
++{
++      u64 hostval = this_cpu_read(x86_spec_ctrl_current);
++
++      if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
++              return;
++
++      if (flags & VMX_RUN_SAVE_SPEC_CTRL)
++              vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL);
++
++      /*
++       * If the guest/host SPEC_CTRL values differ, restore the host value.
++       *
++       * For legacy IBRS, the IBRS bit always needs to be written after
++       * transitioning from a less privileged predictor mode, regardless of
++       * whether the guest/host values differ.
++       */
++      if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
++          vmx->spec_ctrl != hostval)
++              native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval);
++
++      barrier_nospec();
++}
+ 
+ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
+ {
+@@ -6628,32 +6670,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
+               write_cr2(vcpu->arch.cr2);
+ 
+       vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
+-                                 vmx->loaded_vmcs->launched);
++                                 __vmx_vcpu_run_flags(vmx));
+ 
+       vcpu->arch.cr2 = read_cr2();
+ 
+       vmx_enable_fb_clear(vmx);
+ 
+-      /*
+-       * We do not use IBRS in the kernel. If this vCPU has used the
+-       * SPEC_CTRL MSR it may have left it on; save the value and
+-       * turn it off. This is much more efficient than blindly adding
+-       * it to the atomic save/restore list. Especially as the former
+-       * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
+-       *
+-       * For non-nested case:
+-       * If the L01 MSR bitmap does not intercept the MSR, then we need to
+-       * save it.
+-       *
+-       * For nested case:
+-       * If the L02 MSR bitmap does not intercept the MSR, then we need to
+-       * save it.
+-       */
+-      if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)))
+-              vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
+-
+-      x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
+-
+       /* All fields are clean at this point */
+       if (static_branch_unlikely(&enable_evmcs))
+               current_evmcs->hv_clean_fields |=
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index 7a3362ab59867..4d5be4610af84 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -10,6 +10,7 @@
+ #include "capabilities.h"
+ #include "ops.h"
+ #include "vmcs.h"
++#include "run_flags.h"
+ 
+ extern const u32 vmx_msr_index[];
+ extern u64 host_efer;
+@@ -336,6 +337,10 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
+ struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr);
+ void pt_update_intercept_for_msr(struct vcpu_vmx *vmx);
+ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp);
++void vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, unsigned int flags);
++unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx);
++bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs,
++                  unsigned int flags);
+ 
+ #define POSTED_INTR_ON  0
+ #define POSTED_INTR_SN  1
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index d0b297583df88..c431a34522d6c 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -10329,9 +10329,9 @@ void kvm_arch_end_assignment(struct kvm *kvm)
+ }
+ EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
+ 
+-bool kvm_arch_has_assigned_device(struct kvm *kvm)
++bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
+ {
+-      return atomic_read(&kvm->arch.assigned_device_count);
++      return arch_atomic_read(&kvm->arch.assigned_device_count);
+ }
+ EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
+ 
+diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
+index 9b5edf1dfe9e9..7000c836951c5 100644
+--- a/drivers/base/cpu.c
++++ b/drivers/base/cpu.c
+@@ -574,6 +574,12 @@ ssize_t __weak cpu_show_mmio_stale_data(struct device *dev,
+       return sysfs_emit(buf, "Not affected\n");
+ }
+ 
++ssize_t __weak cpu_show_retbleed(struct device *dev,
++                               struct device_attribute *attr, char *buf)
++{
++      return sysfs_emit(buf, "Not affected\n");
++}
++
+ static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
+ static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
+ static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
+@@ -584,6 +590,7 @@ static DEVICE_ATTR(tsx_async_abort, 0444, cpu_show_tsx_async_abort, NULL);
+ static DEVICE_ATTR(itlb_multihit, 0444, cpu_show_itlb_multihit, NULL);
+ static DEVICE_ATTR(srbds, 0444, cpu_show_srbds, NULL);
+ static DEVICE_ATTR(mmio_stale_data, 0444, cpu_show_mmio_stale_data, NULL);
++static DEVICE_ATTR(retbleed, 0444, cpu_show_retbleed, NULL);
+ 
+ static struct attribute *cpu_root_vulnerabilities_attrs[] = {
+       &dev_attr_meltdown.attr,
+@@ -596,6 +603,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = {
+       &dev_attr_itlb_multihit.attr,
+       &dev_attr_srbds.attr,
+       &dev_attr_mmio_stale_data.attr,
++      &dev_attr_retbleed.attr,
+       NULL
+ };
+ 
+diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
+index 4195834a45912..cf7ebe3bd1ad2 100644
+--- a/drivers/cpufreq/acpi-cpufreq.c
++++ b/drivers/cpufreq/acpi-cpufreq.c
+@@ -30,6 +30,7 @@
+ #include <asm/msr.h>
+ #include <asm/processor.h>
+ #include <asm/cpufeature.h>
++#include <asm/cpu_device_id.h>
+ 
+ MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski");
+ MODULE_DESCRIPTION("ACPI Processor P-States Driver");
+diff --git a/drivers/cpufreq/amd_freq_sensitivity.c b/drivers/cpufreq/amd_freq_sensitivity.c
+index e2df9d1121063..5107cbe2d64dd 100644
+--- a/drivers/cpufreq/amd_freq_sensitivity.c
++++ b/drivers/cpufreq/amd_freq_sensitivity.c
+@@ -18,6 +18,7 @@
+ 
+ #include <asm/msr.h>
+ #include <asm/cpufeature.h>
++#include <asm/cpu_device_id.h>
+ 
+ #include "cpufreq_ondemand.h"
+ 
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
+index d8687868407de..b588e0e409e72 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
+@@ -35,7 +35,6 @@
+ #include <linux/pci.h>
+ #include <linux/pm_runtime.h>
+ #include <drm/drm_crtc_helper.h>
+-#include <drm/drm_damage_helper.h>
+ #include <drm/drm_edid.h>
+ #include <drm/drm_gem_framebuffer_helper.h>
+ #include <drm/drm_fb_helper.h>
+@@ -496,7 +495,6 @@ bool amdgpu_display_ddc_probe(struct amdgpu_connector *amdgpu_connector,
+ static const struct drm_framebuffer_funcs amdgpu_fb_funcs = {
+       .destroy = drm_gem_fb_destroy,
+       .create_handle = drm_gem_fb_create_handle,
+-      .dirty = drm_atomic_helper_dirtyfb,
+ };
+ 
+ uint32_t amdgpu_display_supported_domains(struct amdgpu_device *adev,
+diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
+index 347b08b56042f..63b2212262618 100644
+--- a/drivers/idle/intel_idle.c
++++ b/drivers/idle/intel_idle.c
+@@ -46,11 +46,13 @@
+ #include <linux/tick.h>
+ #include <trace/events/power.h>
+ #include <linux/sched.h>
++#include <linux/sched/smt.h>
+ #include <linux/notifier.h>
+ #include <linux/cpu.h>
+ #include <linux/moduleparam.h>
+ #include <asm/cpu_device_id.h>
+ #include <asm/intel-family.h>
++#include <asm/nospec-branch.h>
+ #include <asm/mwait.h>
+ #include <asm/msr.h>
+ 
+@@ -97,6 +99,12 @@ static struct cpuidle_state *cpuidle_state_table;
+  */
+ #define CPUIDLE_FLAG_TLB_FLUSHED      0x10000
+ 
++/*
++ * Disable IBRS across idle (when KERNEL_IBRS), is exclusive vs IRQ_ENABLE
++ * above.
++ */
++#define CPUIDLE_FLAG_IBRS             BIT(16)
++
+ /*
+  * MWAIT takes an 8-bit "hint" in EAX "suggesting"
+  * the C-state (top nibble) and sub-state (bottom nibble)
+@@ -107,6 +115,24 @@ static struct cpuidle_state *cpuidle_state_table;
+ #define flg2MWAIT(flags) (((flags) >> 24) & 0xFF)
+ #define MWAIT2flg(eax) ((eax & 0xFF) << 24)
+ 
++static __cpuidle int intel_idle_ibrs(struct cpuidle_device *dev,
++                                   struct cpuidle_driver *drv, int index)
++{
++      bool smt_active = sched_smt_active();
++      u64 spec_ctrl = spec_ctrl_current();
++      int ret;
++
++      if (smt_active)
++              wrmsrl(MSR_IA32_SPEC_CTRL, 0);
++
++      ret = intel_idle(dev, drv, index);
++
++      if (smt_active)
++              wrmsrl(MSR_IA32_SPEC_CTRL, spec_ctrl);
++
++      return ret;
++}
++
+ /*
+  * States are indexed by the cstate number,
+  * which is also the index into the MWAIT hint array.
+@@ -605,7 +631,7 @@ static struct cpuidle_state skl_cstates[] = {
+       {
+               .name = "C6",
+               .desc = "MWAIT 0x20",
+-              .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
++              .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
+               .exit_latency = 85,
+               .target_residency = 200,
+               .enter = &intel_idle,
+@@ -613,7 +639,7 @@ static struct cpuidle_state skl_cstates[] = {
+       {
+               .name = "C7s",
+               .desc = "MWAIT 0x33",
+-              .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED,
++              .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
+               .exit_latency = 124,
+               .target_residency = 800,
+               .enter = &intel_idle,
+@@ -621,7 +647,7 @@ static struct cpuidle_state skl_cstates[] = {
+       {
+               .name = "C8",
+               .desc = "MWAIT 0x40",
+-              .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
++              .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
+               .exit_latency = 200,
+               .target_residency = 800,
+               .enter = &intel_idle,
+@@ -629,7 +655,7 @@ static struct cpuidle_state skl_cstates[] = {
+       {
+               .name = "C9",
+               .desc = "MWAIT 0x50",
+-              .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED,
++              .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
+               .exit_latency = 480,
+               .target_residency = 5000,
+               .enter = &intel_idle,
+@@ -637,7 +663,7 @@ static struct cpuidle_state skl_cstates[] = {
+       {
+               .name = "C10",
+               .desc = "MWAIT 0x60",
+-              .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
++              .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
+               .exit_latency = 890,
+               .target_residency = 5000,
+               .enter = &intel_idle,
+@@ -666,7 +692,7 @@ static struct cpuidle_state skx_cstates[] = {
+       {
+               .name = "C6",
+               .desc = "MWAIT 0x20",
+-              .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
++              .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
+               .exit_latency = 133,
+               .target_residency = 600,
+               .enter = &intel_idle,
+@@ -1370,6 +1396,11 @@ static void __init intel_idle_cpuidle_driver_init(void)
+               drv->states[drv->state_count] = /* structure copy */
+                       cpuidle_state_table[cstate];
+ 
++              if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) &&
++                  cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_IBRS) {
++                      drv->states[drv->state_count].enter = intel_idle_ibrs;
++              }
++
+               drv->state_count += 1;
+       }
+ 
+diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
+index 510ca69746042..c83ff610ecb6c 100644
+--- a/fs/xfs/libxfs/xfs_attr.c
++++ b/fs/xfs/libxfs/xfs_attr.c
+@@ -1007,7 +1007,7 @@ restart:
+                * The INCOMPLETE flag means that we will find the "old"
+                * attr, not the "new" one.
+                */
+-              args->flags |= XFS_ATTR_INCOMPLETE;
++              args->op_flags |= XFS_DA_OP_INCOMPLETE;
+               state = xfs_da_state_alloc();
+               state->args = args;
+               state->mp = mp;
+diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
+index 0c23127347aca..c86ddbf6d105b 100644
+--- a/fs/xfs/libxfs/xfs_attr_leaf.c
++++ b/fs/xfs/libxfs/xfs_attr_leaf.c
+@@ -2345,8 +2345,8 @@ xfs_attr3_leaf_lookup_int(
+                * If we are looking for INCOMPLETE entries, show only those.
+                * If we are looking for complete entries, show only those.
+                */
+-              if ((args->flags & XFS_ATTR_INCOMPLETE) !=
+-                  (entry->flags & XFS_ATTR_INCOMPLETE)) {
++              if (!!(args->op_flags & XFS_DA_OP_INCOMPLETE) !=
++                  !!(entry->flags & XFS_ATTR_INCOMPLETE)) {
+                       continue;
+               }
+               if (entry->flags & XFS_ATTR_LOCAL) {
+diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
+index 7b74e18becff7..38c05d6ae2aa4 100644
+--- a/fs/xfs/libxfs/xfs_attr_leaf.h
++++ b/fs/xfs/libxfs/xfs_attr_leaf.h
+@@ -17,13 +17,27 @@ struct xfs_inode;
+ struct xfs_trans;
+ 
+ /*
+- * Used to keep a list of "remote value" extents when unlinking an inode.
++ * Incore version of the attribute leaf header.
+  */
+-typedef struct xfs_attr_inactive_list {
+-      xfs_dablk_t     valueblk;       /* block number of value bytes */
+-      int             valuelen;       /* number of bytes in value */
+-} xfs_attr_inactive_list_t;
+-
++struct xfs_attr3_icleaf_hdr {
++      uint32_t        forw;
++      uint32_t        back;
++      uint16_t        magic;
++      uint16_t        count;
++      uint16_t        usedbytes;
++      /*
++       * Firstused is 32-bit here instead of 16-bit like the on-disk variant
++       * to support maximum fsb size of 64k without overflow issues throughout
++       * the attr code. Instead, the overflow condition is handled on
++       * conversion to/from disk.
++       */
++      uint32_t        firstused;
++      __u8            holes;
++      struct {
++              uint16_t        base;
++              uint16_t        size;
++      } freemap[XFS_ATTR_LEAF_MAPSIZE];
++};
+ 
+ /*========================================================================
+  * Function prototypes for the kernel.
+diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
+index 3e39b7d40f256..de9096b8a47c6 100644
+--- a/fs/xfs/libxfs/xfs_attr_remote.c
++++ b/fs/xfs/libxfs/xfs_attr_remote.c
+@@ -24,6 +24,23 @@
+ 
+ #define ATTR_RMTVALUE_MAPSIZE 1       /* # of map entries at once */
+ 
++/*
++ * Remote Attribute Values
++ * =======================
++ *
++ * Remote extended attribute values are conceptually simple -- they're written
++ * to data blocks mapped by an inode's attribute fork, and they have an upper
++ * size limit of 64k.  Setting a value does not involve the XFS log.
++ *
++ * However, on a v5 filesystem, maximally sized remote attr values require one
++ * block more than 64k worth of space to hold both the remote attribute value
++ * header (64 bytes).  On a 4k block filesystem this results in a 68k buffer;
++ * on a 64k block filesystem, this would be a 128k buffer.  Note that the log
++ * format can only handle a dirty buffer of XFS_MAX_BLOCKSIZE length (64k).
++ * Therefore, we /must/ ensure that remote attribute value buffers never touch
++ * the logging system and therefore never have a log item.
++ */
++
+ /*
+  * Each contiguous block has a header, so it is not just a simple attribute
+  * length to FSB conversion.
+@@ -400,17 +417,25 @@ xfs_attr_rmtval_get(
+                              (map[i].br_startblock != HOLESTARTBLOCK));
+                       dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
+                       dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+-                      error = xfs_trans_read_buf(mp, args->trans,
+-                                                 mp->m_ddev_targp,
+-                                                 dblkno, dblkcnt, 0, &bp,
+-                                                 &xfs_attr3_rmt_buf_ops);
+-                      if (error)
++                      bp = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt, 0,
++                                      &xfs_attr3_rmt_buf_ops);
++                      if (!bp)
++                              return -ENOMEM;
++                      error = bp->b_error;
++                      if (error) {
++                              xfs_buf_ioerror_alert(bp, __func__);
++                              xfs_buf_relse(bp);
++
++                              /* bad CRC means corrupted metadata */
++                              if (error == -EFSBADCRC)
++                                      error = -EFSCORRUPTED;
+                               return error;
++                      }
+ 
+                       error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
+                                                       &offset, &valuelen,
+                                                       &dst);
+-                      xfs_trans_brelse(args->trans, bp);
++                      xfs_buf_relse(bp);
+                       if (error)
+                               return error;
+ 
+@@ -551,6 +576,32 @@ xfs_attr_rmtval_set(
+       return 0;
+ }
+ 
++/* Mark stale any incore buffers for the remote value. */
++int
++xfs_attr_rmtval_stale(
++      struct xfs_inode        *ip,
++      struct xfs_bmbt_irec    *map,
++      xfs_buf_flags_t         incore_flags)
++{
++      struct xfs_mount        *mp = ip->i_mount;
++      struct xfs_buf          *bp;
++
++      ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
++
++      ASSERT((map->br_startblock != DELAYSTARTBLOCK) &&
++             (map->br_startblock != HOLESTARTBLOCK));
++
++      bp = xfs_buf_incore(mp->m_ddev_targp,
++                      XFS_FSB_TO_DADDR(mp, map->br_startblock),
++                      XFS_FSB_TO_BB(mp, map->br_blockcount), incore_flags);
++      if (bp) {
++              xfs_buf_stale(bp);
++              xfs_buf_relse(bp);
++      }
++
++      return 0;
++}
++
+ /*
+  * Remove the value associated with an attribute by deleting the
+  * out-of-line buffer that it is stored on.
+@@ -559,7 +610,6 @@ int
+ xfs_attr_rmtval_remove(
+       struct xfs_da_args      *args)
+ {
+-      struct xfs_mount        *mp = args->dp->i_mount;
+       xfs_dablk_t             lblkno;
+       int                     blkcnt;
+       int                     error;
+@@ -574,9 +624,6 @@ xfs_attr_rmtval_remove(
+       blkcnt = args->rmtblkcnt;
+       while (blkcnt > 0) {
+               struct xfs_bmbt_irec    map;
+-              struct xfs_buf          *bp;
+-              xfs_daddr_t             dblkno;
+-              int                     dblkcnt;
+               int                     nmap;
+ 
+               /*
+@@ -588,21 +635,9 @@ xfs_attr_rmtval_remove(
+               if (error)
+                       return error;
+               ASSERT(nmap == 1);
+-              ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+-                     (map.br_startblock != HOLESTARTBLOCK));
+-
+-              dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
+-              dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+-
+-              /*
+-               * If the "remote" value is in the cache, remove it.
+-               */
+-              bp = xfs_buf_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK);
+-              if (bp) {
+-                      xfs_buf_stale(bp);
+-                      xfs_buf_relse(bp);
+-                      bp = NULL;
+-              }
++              error = xfs_attr_rmtval_stale(args->dp, &map, XBF_TRYLOCK);
++              if (error)
++                      return error;
+ 
+               lblkno += map.br_blockcount;
+               blkcnt -= map.br_blockcount;
+diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h
+index 9d20b66ad379e..6fb4572845ce8 100644
+--- a/fs/xfs/libxfs/xfs_attr_remote.h
++++ b/fs/xfs/libxfs/xfs_attr_remote.h
+@@ -11,5 +11,7 @@ int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen);
+ int xfs_attr_rmtval_get(struct xfs_da_args *args);
+ int xfs_attr_rmtval_set(struct xfs_da_args *args);
+ int xfs_attr_rmtval_remove(struct xfs_da_args *args);
++int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map,
++              xfs_buf_flags_t incore_flags);
+ 
+ #endif /* __XFS_ATTR_REMOTE_H__ */
+diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
+index ae0bbd20d9caf..588e4674e931f 100644
+--- a/fs/xfs/libxfs/xfs_da_btree.h
++++ b/fs/xfs/libxfs/xfs_da_btree.h
+@@ -82,6 +82,7 @@ typedef struct xfs_da_args {
+ #define XFS_DA_OP_OKNOENT     0x0008  /* lookup/add op, ENOENT ok, else die */
+ #define XFS_DA_OP_CILOOKUP    0x0010  /* lookup to return CI name if found */
+ #define XFS_DA_OP_ALLOCVAL    0x0020  /* lookup to alloc buffer if found  */
++#define XFS_DA_OP_INCOMPLETE  0x0040  /* lookup INCOMPLETE attr keys */
+ 
+ #define XFS_DA_OP_FLAGS \
+       { XFS_DA_OP_JUSTCHECK,  "JUSTCHECK" }, \
+@@ -89,7 +90,8 @@ typedef struct xfs_da_args {
+       { XFS_DA_OP_ADDNAME,    "ADDNAME" }, \
+       { XFS_DA_OP_OKNOENT,    "OKNOENT" }, \
+       { XFS_DA_OP_CILOOKUP,   "CILOOKUP" }, \
+-      { XFS_DA_OP_ALLOCVAL,   "ALLOCVAL" }
++      { XFS_DA_OP_ALLOCVAL,   "ALLOCVAL" }, \
++      { XFS_DA_OP_INCOMPLETE, "INCOMPLETE" }
+ 
+ /*
+  * Storage for holding state during Btree searches and split/join ops.
+@@ -124,6 +126,19 @@ typedef struct xfs_da_state {
+                                               /* for dirv2 extrablk is data */
+ } xfs_da_state_t;
+ 
++/*
++ * In-core version of the node header to abstract the differences in the v2 and
++ * v3 disk format of the headers. Callers need to convert to/from disk format as
++ * appropriate.
++ */
++struct xfs_da3_icnode_hdr {
++      uint32_t                forw;
++      uint32_t                back;
++      uint16_t                magic;
++      uint16_t                count;
++      uint16_t                level;
++};
++
+ /*
+  * Utility macros to aid in logging changed structure fields.
+  */
+diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
+index b1ae572496b69..31bb250c18992 100644
+--- a/fs/xfs/libxfs/xfs_da_format.c
++++ b/fs/xfs/libxfs/xfs_da_format.c
+@@ -13,6 +13,7 @@
+ #include "xfs_mount.h"
+ #include "xfs_inode.h"
+ #include "xfs_dir2.h"
++#include "xfs_dir2_priv.h"
+ 
+ /*
+  * Shortform directory ops
+diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
+index ae654e06b2fb6..222ee48da5e80 100644
+--- a/fs/xfs/libxfs/xfs_da_format.h
++++ b/fs/xfs/libxfs/xfs_da_format.h
+@@ -93,19 +93,6 @@ struct xfs_da3_intnode {
+       struct xfs_da_node_entry __btree[];
+ };
+ 
+-/*
+- * In-core version of the node header to abstract the differences in the v2 and
+- * v3 disk format of the headers. Callers need to convert to/from disk format as
+- * appropriate.
+- */
+-struct xfs_da3_icnode_hdr {
+-      uint32_t        forw;
+-      uint32_t        back;
+-      uint16_t        magic;
+-      uint16_t        count;
+-      uint16_t        level;
+-};
+-
+ /*
+  * Directory version 2.
+  *
+@@ -434,14 +421,6 @@ struct xfs_dir3_leaf_hdr {
+       __be32                  pad;            /* 64 bit alignment */
+ };
+ 
+-struct xfs_dir3_icleaf_hdr {
+-      uint32_t                forw;
+-      uint32_t                back;
+-      uint16_t                magic;
+-      uint16_t                count;
+-      uint16_t                stale;
+-};
+-
+ /*
+  * Leaf block entry.
+  */
+@@ -520,19 +499,6 @@ struct xfs_dir3_free {
+ 
+ #define XFS_DIR3_FREE_CRC_OFF  offsetof(struct xfs_dir3_free, hdr.hdr.crc)
+ 
+-/*
+- * In core version of the free block header, abstracted away from on-disk format
+- * differences. Use this in the code, and convert to/from the disk version using
+- * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk.
+- */
+-struct xfs_dir3_icfree_hdr {
+-      uint32_t        magic;
+-      uint32_t        firstdb;
+-      uint32_t        nvalid;
+-      uint32_t        nused;
+-
+-};
+-
+ /*
+  * Single block format.
+  *
+@@ -709,29 +675,6 @@ struct xfs_attr3_leafblock {
+        */
+ };
+ 
+-/*
+- * incore, neutral version of the attribute leaf header
+- */
+-struct xfs_attr3_icleaf_hdr {
+-      uint32_t        forw;
+-      uint32_t        back;
+-      uint16_t        magic;
+-      uint16_t        count;
+-      uint16_t        usedbytes;
+-      /*
+-       * firstused is 32-bit here instead of 16-bit like the on-disk variant
+-       * to support maximum fsb size of 64k without overflow issues throughout
+-       * the attr code. Instead, the overflow condition is handled on
+-       * conversion to/from disk.
+-       */
+-      uint32_t        firstused;
+-      __u8            holes;
+-      struct {
+-              uint16_t        base;
+-              uint16_t        size;
+-      } freemap[XFS_ATTR_LEAF_MAPSIZE];
+-};
+-
+ /*
+  * Special value to represent fs block size in the leaf header firstused field.
+  * Only used when block size overflows the 2-bytes available on disk.
+@@ -740,8 +683,6 @@ struct xfs_attr3_icleaf_hdr {
+ 
+ /*
+  * Flags used in the leaf_entry[i].flags field.
+- * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
+- * on the system call, they are "or"ed together for various operations.
+  */
+ #define       XFS_ATTR_LOCAL_BIT      0       /* attr is stored locally */
+ #define       XFS_ATTR_ROOT_BIT       1       /* limit access to trusted attrs */
+diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
+index f542447794928..e170792c0acce 100644
+--- a/fs/xfs/libxfs/xfs_dir2.h
++++ b/fs/xfs/libxfs/xfs_dir2.h
+@@ -18,6 +18,8 @@ struct xfs_dir2_sf_entry;
+ struct xfs_dir2_data_hdr;
+ struct xfs_dir2_data_entry;
+ struct xfs_dir2_data_unused;
++struct xfs_dir3_icfree_hdr;
++struct xfs_dir3_icleaf_hdr;
+ 
+ extern struct xfs_name        xfs_name_dotdot;
+ 
+diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
+index 59f9fb2241a5f..d2eaea663e7f2 100644
+--- a/fs/xfs/libxfs/xfs_dir2_priv.h
++++ b/fs/xfs/libxfs/xfs_dir2_priv.h
+@@ -8,6 +8,25 @@
+ 
+ struct dir_context;
+ 
++/*
++ * In-core version of the leaf and free block headers to abstract the
++ * differences in the v2 and v3 disk format of the headers.
++ */
++struct xfs_dir3_icleaf_hdr {
++      uint32_t                forw;
++      uint32_t                back;
++      uint16_t                magic;
++      uint16_t                count;
++      uint16_t                stale;
++};
++
++struct xfs_dir3_icfree_hdr {
++      uint32_t                magic;
++      uint32_t                firstdb;
++      uint32_t                nvalid;
++      uint32_t                nused;
++};
++
+ /* xfs_dir2.c */
+ extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
+                               xfs_dir2_db_t *dbp);
+diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
+index c968b60cee15b..28203b626f6a2 100644
+--- a/fs/xfs/libxfs/xfs_format.h
++++ b/fs/xfs/libxfs/xfs_format.h
+@@ -1540,6 +1540,13 @@ typedef struct xfs_bmdr_block {
+ #define BMBT_BLOCKCOUNT_BITLEN        21
+ 
+ #define BMBT_STARTOFF_MASK    ((1ULL << BMBT_STARTOFF_BITLEN) - 1)
++#define BMBT_BLOCKCOUNT_MASK  ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1)
++
++/*
++ * bmbt records have a file offset (block) field that is 54 bits wide, so this
++ * is the largest xfs_fileoff_t that we ever expect to see.
++ */
++#define XFS_MAX_FILEOFF               (BMBT_STARTOFF_MASK + BMBT_BLOCKCOUNT_MASK)
+ 
+ typedef struct xfs_bmbt_rec {
+       __be64                  l0, l1;
+diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
+index 766b1386402a0..9c88203b537b1 100644
+--- a/fs/xfs/xfs_attr_inactive.c
++++ b/fs/xfs/xfs_attr_inactive.c
+@@ -25,22 +25,18 @@
+ #include "xfs_error.h"
+ 
+ /*
+- * Look at all the extents for this logical region,
+- * invalidate any buffers that are incore/in transactions.
++ * Invalidate any incore buffers associated with this remote attribute value
++ * extent.   We never log remote attribute value buffers, which means that they
++ * won't be attached to a transaction and are therefore safe to mark stale.
++ * The actual bunmapi will be taken care of later.
+  */
+ STATIC int
+-xfs_attr3_leaf_freextent(
+-      struct xfs_trans        **trans,
++xfs_attr3_rmt_stale(
+       struct xfs_inode        *dp,
+       xfs_dablk_t             blkno,
+       int                     blkcnt)
+ {
+       struct xfs_bmbt_irec    map;
+-      struct xfs_buf          *bp;
+-      xfs_dablk_t             tblkno;
+-      xfs_daddr_t             dblkno;
+-      int                     tblkcnt;
+-      int                     dblkcnt;
+       int                     nmap;
+       int                     error;
+ 
+@@ -48,47 +44,28 @@ xfs_attr3_leaf_freextent(
+        * Roll through the "value", invalidating the attribute value's
+        * blocks.
+        */
+-      tblkno = blkno;
+-      tblkcnt = blkcnt;
+-      while (tblkcnt > 0) {
++      while (blkcnt > 0) {
+               /*
+                * Try to remember where we decided to put the value.
+                */
+               nmap = 1;
+-              error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt,
++              error = xfs_bmapi_read(dp, (xfs_fileoff_t)blkno, blkcnt,
+                                      &map, &nmap, XFS_BMAPI_ATTRFORK);
+-              if (error) {
++              if (error)
+                       return error;
+-              }
+               ASSERT(nmap == 1);
+-              ASSERT(map.br_startblock != DELAYSTARTBLOCK);
+ 
+               /*
+-               * If it's a hole, these are already unmapped
+-               * so there's nothing to invalidate.
++               * Mark any incore buffers for the remote value as stale.  We
++               * never log remote attr value buffers, so the buffer should be
++               * easy to kill.
+                */
+-              if (map.br_startblock != HOLESTARTBLOCK) {
+-
+-                      dblkno = XFS_FSB_TO_DADDR(dp->i_mount,
+-                                                map.br_startblock);
+-                      dblkcnt = XFS_FSB_TO_BB(dp->i_mount,
+-                                              map.br_blockcount);
+-                      bp = xfs_trans_get_buf(*trans,
+-                                      dp->i_mount->m_ddev_targp,
+-                                      dblkno, dblkcnt, 0);
+-                      if (!bp)
+-                              return -ENOMEM;
+-                      xfs_trans_binval(*trans, bp);
+-                      /*
+-                       * Roll to next transaction.
+-                       */
+-                      error = xfs_trans_roll_inode(trans, dp);
+-                      if (error)
+-                              return error;
+-              }
++              error = xfs_attr_rmtval_stale(dp, &map, 0);
++              if (error)
++                      return error;
+ 
+-              tblkno += map.br_blockcount;
+-              tblkcnt -= map.br_blockcount;
++              blkno += map.br_blockcount;
++              blkcnt -= map.br_blockcount;
+       }
+ 
+       return 0;
+@@ -102,86 +79,45 @@ xfs_attr3_leaf_freextent(
+  */
+ STATIC int
+ xfs_attr3_leaf_inactive(
+-      struct xfs_trans        **trans,
+-      struct xfs_inode        *dp,
+-      struct xfs_buf          *bp)
++      struct xfs_trans                **trans,
++      struct xfs_inode                *dp,
++      struct xfs_buf                  *bp)
+ {
+-      struct xfs_attr_leafblock *leaf;
+-      struct xfs_attr3_icleaf_hdr ichdr;
+-      struct xfs_attr_leaf_entry *entry;
++      struct xfs_attr3_icleaf_hdr     ichdr;
++      struct xfs_mount                *mp = bp->b_mount;
++      struct xfs_attr_leafblock       *leaf = bp->b_addr;
++      struct xfs_attr_leaf_entry      *entry;
+       struct xfs_attr_leaf_name_remote *name_rmt;
+-      struct xfs_attr_inactive_list *list;
+-      struct xfs_attr_inactive_list *lp;
+-      int                     error;
+-      int                     count;
+-      int                     size;
+-      int                     tmp;
+-      int                     i;
+-      struct xfs_mount        *mp = bp->b_mount;
++      int                             error = 0;
++      int                             i;
+ 
+-      leaf = bp->b_addr;
+       xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
+ 
+       /*
+-       * Count the number of "remote" value extents.
++       * Find the remote value extents for this leaf and invalidate their
++       * incore buffers.
+        */
+-      count = 0;
+       entry = xfs_attr3_leaf_entryp(leaf);
+       for (i = 0; i < ichdr.count; entry++, i++) {
+-              if (be16_to_cpu(entry->nameidx) &&
+-                  ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
+-                      name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
+-                      if (name_rmt->valueblk)
+-                              count++;
+-              }
+-      }
+-
+-      /*
+-       * If there are no "remote" values, we're done.
+-       */
+-      if (count == 0) {
+-              xfs_trans_brelse(*trans, bp);
+-              return 0;
+-      }
++              int             blkcnt;
+ 
+-      /*
+-       * Allocate storage for a list of all the "remote" value extents.
+-       */
+-      size = count * sizeof(xfs_attr_inactive_list_t);
+-      list = kmem_alloc(size, 0);
+-
+-      /*
+-       * Identify each of the "remote" value extents.
+-       */
+-      lp = list;
+-      entry = xfs_attr3_leaf_entryp(leaf);
+-      for (i = 0; i < ichdr.count; entry++, i++) {
+-              if (be16_to_cpu(entry->nameidx) &&
+-                  ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
+-                      name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
+-                      if (name_rmt->valueblk) {
+-                              lp->valueblk = be32_to_cpu(name_rmt->valueblk);
+-                              lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount,
+-                                                  be32_to_cpu(name_rmt->valuelen));
+-                              lp++;
+-                      }
+-              }
+-      }
+-      xfs_trans_brelse(*trans, bp);   /* unlock for trans. in freextent() */
++              if (!entry->nameidx || (entry->flags & XFS_ATTR_LOCAL))
++                      continue;
+ 
+-      /*
+-       * Invalidate each of the "remote" value extents.
+-       */
+-      error = 0;
+-      for (lp = list, i = 0; i < count; i++, lp++) {
+-              tmp = xfs_attr3_leaf_freextent(trans, dp,
+-                              lp->valueblk, lp->valuelen);
++              name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
++              if (!name_rmt->valueblk)
++                      continue;
+ 
+-              if (error == 0)
+-                      error = tmp;    /* save only the 1st errno */
++              blkcnt = xfs_attr3_rmt_blocks(dp->i_mount,
++                              be32_to_cpu(name_rmt->valuelen));
++              error = xfs_attr3_rmt_stale(dp,
++                              be32_to_cpu(name_rmt->valueblk), blkcnt);
++              if (error)
++                      goto err;
+       }
+ 
+-      kmem_free(list);
++      xfs_trans_brelse(*trans, bp);
++err:
+       return error;
+ }
+ 
+diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
+index 203065a647652..e41c13ffa5a43 100644
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -187,7 +187,12 @@ xfs_file_dio_aio_read(
+ 
+       file_accessed(iocb->ki_filp);
+ 
+-      xfs_ilock(ip, XFS_IOLOCK_SHARED);
++      if (iocb->ki_flags & IOCB_NOWAIT) {
++              if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
++                      return -EAGAIN;
++      } else {
++              xfs_ilock(ip, XFS_IOLOCK_SHARED);
++      }
+       ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
+       xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ 
+diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
+index 7b72c189cff0b..30202d8c25e4f 100644
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -1513,10 +1513,8 @@ xfs_itruncate_extents_flags(
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_trans        *tp = *tpp;
+       xfs_fileoff_t           first_unmap_block;
+-      xfs_fileoff_t           last_block;
+       xfs_filblks_t           unmap_len;
+       int                     error = 0;
+-      int                     done = 0;
+ 
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+       ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
+@@ -1536,21 +1534,22 @@ xfs_itruncate_extents_flags(
+        * the end of the file (in a crash where the space is allocated
+        * but the inode size is not yet updated), simply remove any
+        * blocks which show up between the new EOF and the maximum
+-       * possible file size.  If the first block to be removed is
+-       * beyond the maximum file size (ie it is the same as last_block),
+-       * then there is nothing to do.
++       * possible file size.
++       *
++       * We have to free all the blocks to the bmbt maximum offset, even if
++       * the page cache can't scale that far.
+        */
+       first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
+-      last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
+-      if (first_unmap_block == last_block)
++      if (first_unmap_block >= XFS_MAX_FILEOFF) {
++              WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF);
+               return 0;
++      }
+ 
+-      ASSERT(first_unmap_block < last_block);
+-      unmap_len = last_block - first_unmap_block + 1;
+-      while (!done) {
++      unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1;
++      while (unmap_len > 0) {
+               ASSERT(tp->t_firstblock == NULLFSBLOCK);
+-              error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags,
+-                                  XFS_ITRUNC_MAX_EXTENTS, &done);
++              error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len,
++                              flags, XFS_ITRUNC_MAX_EXTENTS);
+               if (error)
+                       goto out;
+ 
+@@ -1570,7 +1569,7 @@ xfs_itruncate_extents_flags(
+       if (whichfork == XFS_DATA_FORK) {
+               /* Remove all pending CoW reservations. */
+               error = xfs_reflink_cancel_cow_blocks(ip, &tp,
+-                              first_unmap_block, last_block, true);
++                              first_unmap_block, XFS_MAX_FILEOFF, true);
+               if (error)
+                       goto out;
+ 
+diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
+index 904d8285c2269..dfbf3f8f1ec86 100644
+--- a/fs/xfs/xfs_reflink.c
++++ b/fs/xfs/xfs_reflink.c
+@@ -1544,7 +1544,8 @@ xfs_reflink_clear_inode_flag(
+        * We didn't find any shared blocks so turn off the reflink flag.
+        * First, get rid of any leftover CoW mappings.
+        */
+-      error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true);
++      error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF,
++                      true);
+       if (error)
+               return error;
+ 
+diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
+index 8d1df9f8be071..a3a54a0fbffea 100644
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -512,32 +512,6 @@ xfs_showargs(
+               seq_puts(m, ",noquota");
+ }
+ 
+-static uint64_t
+-xfs_max_file_offset(
+-      unsigned int            blockshift)
+-{
+-      unsigned int            pagefactor = 1;
+-      unsigned int            bitshift = BITS_PER_LONG - 1;
+-
+-      /* Figure out maximum filesize, on Linux this can depend on
+-       * the filesystem blocksize (on 32 bit platforms).
+-       * __block_write_begin does this in an [unsigned] long long...
+-       *      page->index << (PAGE_SHIFT - bbits)
+-       * So, for page sized blocks (4K on 32 bit platforms),
+-       * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
+-       *      (((u64)PAGE_SIZE << (BITS_PER_LONG-1))-1)
+-       * but for smaller blocksizes it is less (bbits = log2 bsize).
+-       */
+-
+-#if BITS_PER_LONG == 32
+-      ASSERT(sizeof(sector_t) == 8);
+-      pagefactor = PAGE_SIZE;
+-      bitshift = BITS_PER_LONG;
+-#endif
+-
+-      return (((uint64_t)pagefactor) << bitshift) - 1;
+-}
+-
+ /*
+  * Set parameters for inode allocation heuristics, taking into account
+  * filesystem size and inode32/inode64 mount options; i.e. specifically
+@@ -1650,6 +1624,26 @@ xfs_fs_fill_super(
+       if (error)
+               goto out_free_sb;
+ 
++      /*
++       * XFS block mappings use 54 bits to store the logical block offset.
++       * This should suffice to handle the maximum file size that the VFS
++       * supports (currently 2^63 bytes on 64-bit and ULONG_MAX << PAGE_SHIFT
++       * bytes on 32-bit), but as XFS and VFS have gotten the s_maxbytes
++       * calculation wrong on 32-bit kernels in the past, we'll add a WARN_ON
++       * to check this assertion.
++       *
++       * Avoid integer overflow by comparing the maximum bmbt offset to the
++       * maximum pagecache offset in units of fs blocks.
++       */
++      if (XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE) > XFS_MAX_FILEOFF) {
++              xfs_warn(mp,
++"MAX_LFS_FILESIZE block offset (%llu) exceeds extent map maximum (%llu)!",
++                       XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE),
++                       XFS_MAX_FILEOFF);
++              error = -EINVAL;
++              goto out_free_sb;
++      }
++
+       error = xfs_filestream_mount(mp);
+       if (error)
+               goto out_free_sb;
+@@ -1661,7 +1655,7 @@ xfs_fs_fill_super(
+       sb->s_magic = XFS_SUPER_MAGIC;
+       sb->s_blocksize = mp->m_sb.sb_blocksize;
+       sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
+-      sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);
++      sb->s_maxbytes = MAX_LFS_FILESIZE;
+       sb->s_max_links = XFS_MAXLINK;
+       sb->s_time_gran = 1;
+       sb->s_time_min = S32_MIN;
+diff --git a/include/linux/cpu.h b/include/linux/cpu.h
+index 29a6fa2f518db..b42e9c4134475 100644
+--- a/include/linux/cpu.h
++++ b/include/linux/cpu.h
+@@ -68,6 +68,8 @@ extern ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr,
+ extern ssize_t cpu_show_mmio_stale_data(struct device *dev,
+                                       struct device_attribute *attr,
+                                       char *buf);
++extern ssize_t cpu_show_retbleed(struct device *dev,
++                               struct device_attribute *attr, char *buf);
+ 
+ extern __printf(4, 5)
+ struct device *cpu_device_create(struct device *parent, void *drvdata,
+diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
+index dd4cdad76b18e..ee7d57478a454 100644
+--- a/include/linux/kvm_host.h
++++ b/include/linux/kvm_host.h
+@@ -955,7 +955,7 @@ static inline void kvm_arch_end_assignment(struct kvm *kvm)
+ {
+ }
+ 
+-static inline bool kvm_arch_has_assigned_device(struct kvm *kvm)
++static __always_inline bool kvm_arch_has_assigned_device(struct kvm *kvm)
+ {
+       return false;
+ }
+diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
+index 4c56404e53a76..8265b99d6d55b 100644
+--- a/include/linux/mod_devicetable.h
++++ b/include/linux/mod_devicetable.h
+@@ -672,9 +672,7 @@ struct x86_cpu_id {
+       __u16 steppings;
+ };
+ 
+-#define X86_FEATURE_MATCH(x) \
+-      { X86_VENDOR_ANY, X86_FAMILY_ANY, X86_MODEL_ANY, x }
+-
++/* Wild cards for x86_cpu_id::vendor, family, model and feature */
+ #define X86_VENDOR_ANY 0xffff
+ #define X86_FAMILY_ANY 0
+ #define X86_MODEL_ANY  0
+diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn
+index 854e2ba9daa29..6a78afc6f13b4 100644
+--- a/scripts/Makefile.extrawarn
++++ b/scripts/Makefile.extrawarn
+@@ -50,6 +50,7 @@ KBUILD_CFLAGS += -Wno-sign-compare
+ KBUILD_CFLAGS += -Wno-format-zero-length
+ KBUILD_CFLAGS += $(call cc-disable-warning, pointer-to-enum-cast)
+ KBUILD_CFLAGS += $(call cc-disable-warning, unaligned-access)
++KBUILD_CFLAGS += $(call cc-disable-warning, cast-function-type-strict)
+ endif
+ 
+ endif
+diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
+index 59f924e92c284..3efaf338d3257 100644
+--- a/tools/arch/x86/include/asm/cpufeatures.h
++++ b/tools/arch/x86/include/asm/cpufeatures.h
+@@ -284,7 +284,7 @@
+ #define X86_FEATURE_CQM_MBM_LOCAL     (11*32+ 3) /* LLC Local MBM monitoring */
+ #define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */
+ #define X86_FEATURE_FENCE_SWAPGS_KERNEL       (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */
+-#define X86_FEATURE_RSB_VMEXIT_LITE   (11*32+ 6) /* "" Fill RSB on VM-Exit when EIBRS is enabled */
++#define X86_FEATURE_RSB_VMEXIT_LITE   (11*32+17) /* "" Fill RSB on VM-Exit when EIBRS is enabled */
+ 
+ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
+ #define X86_FEATURE_AVX512_BF16               (12*32+ 5) /* AVX512 BFLOAT16 instructions */
