[tip: x86/cpu] x86/mce: Add Xeon Sapphire Rapids to list of CPUs that support PPIN

2021-03-20 Thread tip-bot2 for Tony Luck
The following commit has been merged into the x86/cpu branch of tip:

Commit-ID: a331f5fdd36dba1ffb0239a4dfaaf1df91ff1aab
Gitweb:
https://git.kernel.org/tip/a331f5fdd36dba1ffb0239a4dfaaf1df91ff1aab
Author: Tony Luck
AuthorDate: Fri, 19 Mar 2021 10:39:19 -07:00
Committer: Ingo Molnar 
CommitterDate: Sat, 20 Mar 2021 12:12:10 +01:00

x86/mce: Add Xeon Sapphire Rapids to list of CPUs that support PPIN

New CPU model, same MSRs to control and read the inventory number.

Signed-off-by: Tony Luck 
Signed-off-by: Ingo Molnar 
Link: https://lore.kernel.org/r/20210319173919.291428-1-tony.l...@intel.com
---
 arch/x86/kernel/cpu/mce/intel.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index e309476..acfd5d9 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -486,6 +486,7 @@ static void intel_ppin_init(struct cpuinfo_x86 *c)
case INTEL_FAM6_BROADWELL_X:
case INTEL_FAM6_SKYLAKE_X:
case INTEL_FAM6_ICELAKE_X:
+   case INTEL_FAM6_SAPPHIRERAPIDS_X:
case INTEL_FAM6_XEON_PHI_KNL:
case INTEL_FAM6_XEON_PHI_KNM:
 

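For context, a minimal sketch (not taken from the patch) of the probe-and-enable pattern that intel_ppin_init() uses for the MSRs the message refers to: MSR_PPIN_CTL (0x4e) controls the feature and MSR_PPIN (0x4f) holds the inventory number, both from msr-index.h. Bit 0 of the control MSR is the lock bit and bit 1 the enable bit; the helper name below is invented for illustration.

    static bool ppin_usable(void)
    {
            u64 val;

            if (rdmsrl_safe(MSR_PPIN_CTL, &val))
                    return false;           /* MSR not implemented here */

            if ((val & 3ULL) == 1ULL)
                    return false;           /* locked with PPIN disabled */

            if (!(val & 2ULL)) {            /* disabled but not locked: try to enable */
                    wrmsrl_safe(MSR_PPIN_CTL, val | 2ULL);
                    rdmsrl_safe(MSR_PPIN_CTL, &val);
            }

            return (val & 2ULL) == 2ULL;    /* if set, MSR_PPIN can be read */
    }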

[tip: ras/core] x86/mce: Use "safe" MSR functions when enabling additional error logging

2020-11-16 Thread tip-bot2 for Tony Luck
The following commit has been merged into the ras/core branch of tip:

Commit-ID: 098416e6986127f7e4c8ce4fd6bbbd80e55b0386
Gitweb:
https://git.kernel.org/tip/098416e6986127f7e4c8ce4fd6bbbd80e55b0386
Author: Tony Luck
AuthorDate: Tue, 10 Nov 2020 16:39:54 -08:00
Committer: Borislav Petkov 
CommitterDate: Mon, 16 Nov 2020 17:34:08 +01:00

x86/mce: Use "safe" MSR functions when enabling additional error logging

Booting as a guest under KVM results in error messages about
unchecked MSR access:

  unchecked MSR access error: RDMSR from 0x17f at rIP: 0x84483f16 
(mce_intel_feature_init+0x156/0x270)

because KVM doesn't provide emulation for random model specific
registers.

Switch to using rdmsrl_safe()/wrmsrl_safe() to avoid the message.

Fixes: 68299a42f842 ("x86/mce: Enable additional error logging on certain Intel 
CPUs")
Reported-by: Qian Cai 
Signed-off-by: Tony Luck 
Signed-off-by: Borislav Petkov 
Link: 
https://lkml.kernel.org/r/2020003954.ga11...@agluck-desk2.amr.corp.intel.com
---
 arch/x86/kernel/cpu/mce/intel.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index b47883e..c2476fe 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -521,9 +521,10 @@ static void intel_imc_init(struct cpuinfo_x86 *c)
case INTEL_FAM6_SANDYBRIDGE_X:
case INTEL_FAM6_IVYBRIDGE_X:
case INTEL_FAM6_HASWELL_X:
-   rdmsrl(MSR_ERROR_CONTROL, error_control);
+   if (rdmsrl_safe(MSR_ERROR_CONTROL, &error_control))
+   return;
error_control |= 2;
-   wrmsrl(MSR_ERROR_CONTROL, error_control);
+   wrmsrl_safe(MSR_ERROR_CONTROL, error_control);
break;
}
 }

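A hedged sketch of the difference: rdmsrl() has no fixup, so a RDMSR that #GPs (for example under a hypervisor that does not emulate the MSR) is reported as an "unchecked MSR access error", while the *_safe() variants attach an exception-table entry and return non-zero instead. The helper name is illustrative only.

    static void enable_extended_logging_sketch(void)
    {
            u64 val;

            if (rdmsrl_safe(MSR_ERROR_CONTROL, &val))   /* non-zero if the RDMSR faulted */
                    return;                             /* quietly skip the feature */
            wrmsrl_safe(MSR_ERROR_CONTROL, val | 2);
    }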

[tip: ras/core] x86/mce: Enable additional error logging on certain Intel CPUs

2020-11-02 Thread tip-bot2 for Tony Luck
The following commit has been merged into the ras/core branch of tip:

Commit-ID: 68299a42f84288537ee3420c431ac0115ccb90b1
Gitweb:
https://git.kernel.org/tip/68299a42f84288537ee3420c431ac0115ccb90b1
Author: Tony Luck
AuthorDate: Fri, 30 Oct 2020 12:04:00 -07:00
Committer: Borislav Petkov 
CommitterDate: Mon, 02 Nov 2020 11:15:59 +01:00

x86/mce: Enable additional error logging on certain Intel CPUs

The Xeon versions of Sandy Bridge, Ivy Bridge and Haswell support an
optional additional error logging mode which is enabled by an MSR.

Previously, this mode was enabled from the mcelog(8) tool via /dev/cpu,
but userspace should not be poking at MSRs. So move the enabling into
the kernel.

 [ bp: Correct the explanation why this is done. ]

Suggested-by: Boris Petkov 
Signed-off-by: Tony Luck 
Signed-off-by: Borislav Petkov 
Link: 
https://lkml.kernel.org/r/20201030190807.ga13...@agluck-desk2.amr.corp.intel.com
---
 arch/x86/include/asm/msr-index.h |  1 +
 arch/x86/kernel/cpu/mce/intel.c  | 20 
 2 files changed, 21 insertions(+)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 972a34d..b2dd264 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -139,6 +139,7 @@
 #define MSR_IA32_MCG_CAP   0x0179
 #define MSR_IA32_MCG_STATUS0x017a
 #define MSR_IA32_MCG_CTL   0x017b
+#define MSR_ERROR_CONTROL  0x017f
 #define MSR_IA32_MCG_EXT_CTL   0x04d0
 
 #define MSR_OFFCORE_RSP_0  0x01a6
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index abe9fe0..b47883e 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -509,12 +509,32 @@ static void intel_ppin_init(struct cpuinfo_x86 *c)
}
 }
 
+/*
+ * Enable additional error logs from the integrated
+ * memory controller on processors that support this.
+ */
+static void intel_imc_init(struct cpuinfo_x86 *c)
+{
+   u64 error_control;
+
+   switch (c->x86_model) {
+   case INTEL_FAM6_SANDYBRIDGE_X:
+   case INTEL_FAM6_IVYBRIDGE_X:
+   case INTEL_FAM6_HASWELL_X:
+   rdmsrl(MSR_ERROR_CONTROL, error_control);
+   error_control |= 2;
+   wrmsrl(MSR_ERROR_CONTROL, error_control);
+   break;
+   }
+}
+
 void mce_intel_feature_init(struct cpuinfo_x86 *c)
 {
intel_init_thermal(c);
intel_init_cmci();
intel_init_lmce();
intel_ppin_init(c);
+   intel_imc_init(c);
 }
 
 void mce_intel_feature_clear(struct cpuinfo_x86 *c)

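As a point of comparison, here is roughly how a userspace tool such as mcelog(8) could have set the same bit through the msr driver before this change. This is an illustrative sketch only; the function name and error handling are simplified.

    #include <fcntl.h>
    #include <stdint.h>
    #include <unistd.h>

    static int poke_error_control(const char *dev)      /* e.g. "/dev/cpu/0/msr" */
    {
            uint64_t val;
            int fd = open(dev, O_RDWR);

            if (fd < 0)
                    return -1;
            if (pread(fd, &val, sizeof(val), 0x17f) == sizeof(val)) {
                    val |= 2;               /* same bit intel_imc_init() now sets */
                    pwrite(fd, &val, sizeof(val), 0x17f);
            }
            close(fd);
            return 0;
    }

The /dev/cpu/N/msr interface reads and writes 8 bytes at an offset equal to the MSR number, which is exactly the "userspace poking at MSRs" that this commit moves into the kernel.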

[tip: ras/core] x86/mce: Recover from poison found while copying from user space

2020-10-07 Thread tip-bot2 for Tony Luck
The following commit has been merged into the ras/core branch of tip:

Commit-ID: c0ab7ffce275d3f83bd253c70889c28821d4a41d
Gitweb:
https://git.kernel.org/tip/c0ab7ffce275d3f83bd253c70889c28821d4a41d
Author: Tony Luck
AuthorDate: Tue, 06 Oct 2020 14:09:09 -07:00
Committer: Borislav Petkov 
CommitterDate: Wed, 07 Oct 2020 11:29:41 +02:00

x86/mce: Recover from poison found while copying from user space

Existing kernel code can only recover from a machine check on code that
is tagged in the exception table with a fault handling recovery path.

Add two new fields in the task structure to pass information from
machine check handler to the "task_work" that is queued to run before
the task returns to user mode:

+ mce_vaddr: will be initialized to the user virtual address of the fault
  in the case where the fault occurred in the kernel copying data from
  a user address.  This is so that kill_me_maybe() can provide that
  information to the user SIGBUS handler.

+ mce_kflags: copy of the struct mce.kflags needed by kill_me_maybe()
  to determine if mce_vaddr is applicable to this error.

Add code to recover from a machine check while copying data from user
space to the kernel. Action for this case is the same as if the user
touched the poison directly; unmap the page and send a SIGBUS to the task.

Use a new helper function to share common code between the "fault
in user mode" case and the "fault while copying from user" case.

New code paths will be activated by the next patch which sets
MCE_IN_KERNEL_COPYIN.

Suggested-by: Borislav Petkov 
Signed-off-by: Tony Luck 
Signed-off-by: Borislav Petkov 
Link: https://lkml.kernel.org/r/20201006210910.21062-6-tony.l...@intel.com
---
 arch/x86/kernel/cpu/mce/core.c | 27 ---
 include/linux/sched.h  |  2 ++
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 2d6caf0..5c423c4 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1260,6 +1260,21 @@ static void kill_me_maybe(struct callback_head *cb)
kill_me_now(cb);
 }
 
+static void queue_task_work(struct mce *m, int kill_it)
+{
+   current->mce_addr = m->addr;
+   current->mce_kflags = m->kflags;
+   current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
+   current->mce_whole_page = whole_page(m);
+
+   if (kill_it)
+   current->mce_kill_me.func = kill_me_now;
+   else
+   current->mce_kill_me.func = kill_me_maybe;
+
+   task_work_add(current, &current->mce_kill_me, true);
+}
+
 /*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
@@ -1401,13 +1416,8 @@ noinstr void do_machine_check(struct pt_regs *regs)
/* If this triggers there is no way to recover. Die hard. */
BUG_ON(!on_thread_stack() || !user_mode(regs));
 
-   current->mce_addr = m.addr;
-   current->mce_ripv = !!(m.mcgstatus & MCG_STATUS_RIPV);
-   current->mce_whole_page = whole_page(&m);
-   current->mce_kill_me.func = kill_me_maybe;
-   if (kill_it)
-   current->mce_kill_me.func = kill_me_now;
-   task_work_add(current, &current->mce_kill_me, true);
+   queue_task_work(&m, kill_it);
+
} else {
/*
 * Handle an MCE which has happened in kernel space but from
@@ -1422,6 +1432,9 @@ noinstr void do_machine_check(struct pt_regs *regs)
if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
mce_panic("Failed kernel mode recovery", , 
msg);
}
+
+   if (m.kflags & MCE_IN_KERNEL_COPYIN)
+   queue_task_work(&m, kill_it);
}
 out:
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 93ecd93..2cbba3e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1308,6 +1308,8 @@ struct task_struct {
 #endif
 
 #ifdef CONFIG_X86_MCE
+   void __user *mce_vaddr;
+   __u64   mce_kflags;
u64 mce_addr;
__u64   mce_ripv : 1,
mce_whole_page : 1,

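For readers unfamiliar with the mechanism, a short illustrative sketch of the task_work pattern the handler relies on. The callback below is made up; the real callbacks are kill_me_now()/kill_me_maybe() shown above.

    #include <linux/sched.h>
    #include <linux/task_work.h>

    static void poison_report(struct callback_head *cb)
    {
            /*
             * Runs in process context just before the task returns to
             * user mode, so it may sleep and call memory_failure().
             */
    }

    static void queue_poison_report(void)
    {
            init_task_work(&current->mce_kill_me, poison_report);
            task_work_add(current, &current->mce_kill_me, true);
    }

Queuing from the #MC handler and doing the heavy lifting on the way back to user mode is what makes it safe to consume mce_vaddr/mce_kflags later.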

[tip: ras/core] x86/mce: Provide method to find out the type of an exception handler

2020-10-07 Thread tip-bot2 for Tony Luck
The following commit has been merged into the ras/core branch of tip:

Commit-ID: a05d54c41ecfa1a322b229b4e5ce50c157284f74
Gitweb:
https://git.kernel.org/tip/a05d54c41ecfa1a322b229b4e5ce50c157284f74
Author: Tony Luck
AuthorDate: Tue, 06 Oct 2020 14:09:06 -07:00
Committer: Borislav Petkov 
CommitterDate: Wed, 07 Oct 2020 11:08:59 +02:00

x86/mce: Provide method to find out the type of an exception handler

Avoid a proliferation of ex_has_*_handler() functions by having just
one function that returns the type of the handler (if any).

Drop the __visible attribute for this function. It is not called
from assembler so the attribute is not necessary.

Signed-off-by: Tony Luck 
Signed-off-by: Borislav Petkov 
Link: https://lkml.kernel.org/r/20201006210910.21062-3-tony.l...@intel.com
---
 arch/x86/include/asm/extable.h |  9 -
 arch/x86/kernel/cpu/mce/severity.c |  5 -
 arch/x86/mm/extable.c  | 12 
 3 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h
index d8c2198..1f0cbc5 100644
--- a/arch/x86/include/asm/extable.h
+++ b/arch/x86/include/asm/extable.h
@@ -29,10 +29,17 @@ struct pt_regs;
(b)->handler = (tmp).handler - (delta); \
} while (0)
 
+enum handler_type {
+   EX_HANDLER_NONE,
+   EX_HANDLER_FAULT,
+   EX_HANDLER_UACCESS,
+   EX_HANDLER_OTHER
+};
+
 extern int fixup_exception(struct pt_regs *regs, int trapnr,
   unsigned long error_code, unsigned long fault_addr);
 extern int fixup_bug(struct pt_regs *regs, int trapnr);
-extern bool ex_has_fault_handler(unsigned long ip);
+extern enum handler_type ex_get_fault_handler_type(unsigned long ip);
 extern void early_fixup_exception(struct pt_regs *regs, int trapnr);
 
 #endif
diff --git a/arch/x86/kernel/cpu/mce/severity.c 
b/arch/x86/kernel/cpu/mce/severity.c
index 0b072dc..c6494e6 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -225,10 +225,13 @@ static struct severity {
  */
 static int error_context(struct mce *m, struct pt_regs *regs)
 {
+   enum handler_type t;
+
if ((m->cs & 3) == 3)
return IN_USER;
 
-   if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip)) {
+   t = ex_get_fault_handler_type(m->ip);
+   if (mc_recoverable(m->mcgstatus) && t == EX_HANDLER_FAULT) {
m->kflags |= MCE_IN_KERNEL_RECOV;
return IN_KERNEL_RECOV;
}
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 1d6cb07..de43525 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -125,17 +125,21 @@ __visible bool ex_handler_clear_fs(const struct 
exception_table_entry *fixup,
 }
 EXPORT_SYMBOL(ex_handler_clear_fs);
 
-__visible bool ex_has_fault_handler(unsigned long ip)
+enum handler_type ex_get_fault_handler_type(unsigned long ip)
 {
const struct exception_table_entry *e;
ex_handler_t handler;
 
e = search_exception_tables(ip);
if (!e)
-   return false;
+   return EX_HANDLER_NONE;
handler = ex_fixup_handler(e);
-
-   return handler == ex_handler_fault;
+   if (handler == ex_handler_fault)
+   return EX_HANDLER_FAULT;
+   else if (handler == ex_handler_uaccess)
+   return EX_HANDLER_UACCESS;
+   else
+   return EX_HANDLER_OTHER;
 }
 
 int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,

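A small usage sketch of the new classification API (the helper name is invented; at this point in the series error_context() above still treats only EX_HANDLER_FAULT as recoverable, and the UACCESS case is consumed by the follow-on patches):

    #include <asm/extable.h>

    static bool ip_has_known_fixup(unsigned long ip)
    {
            switch (ex_get_fault_handler_type(ip)) {
            case EX_HANDLER_FAULT:          /* _ASM_EXTABLE_FAULT() style entry */
            case EX_HANDLER_UACCESS:        /* user-copy (_ASM_EXTABLE_UA) entry */
                    return true;
            case EX_HANDLER_NONE:           /* no exception-table entry at all */
            case EX_HANDLER_OTHER:
            default:
                    return false;
            }
    }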

[tip: ras/core] x86/mce: Avoid tail copy when machine check terminated a copy from user

2020-10-07 Thread tip-bot2 for Tony Luck
The following commit has been merged into the ras/core branch of tip:

Commit-ID: a2f73400e4dfd13f673c6e1b4b98d180fd1e47b3
Gitweb:
https://git.kernel.org/tip/a2f73400e4dfd13f673c6e1b4b98d180fd1e47b3
Author: Tony Luck
AuthorDate: Tue, 06 Oct 2020 14:09:08 -07:00
Committer: Borislav Petkov 
CommitterDate: Wed, 07 Oct 2020 11:26:56 +02:00

x86/mce: Avoid tail copy when machine check terminated a copy from user

In the page fault case it is ok to see if a few more unaligned bytes
can be copied from the source address. Worst case is that the page fault
will be triggered again.

Machine checks are more serious. Just give up at the point where the
main copy loop triggered the #MC and return from the copy code as if
the copy succeeded. The machine check handler will use task_work_add() to
make sure that the task is sent a SIGBUS.

Signed-off-by: Tony Luck 
Signed-off-by: Borislav Petkov 
Link: https://lkml.kernel.org/r/20201006210910.21062-5-tony.l...@intel.com
---
 arch/x86/lib/copy_user_64.S | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 5b68e94..77b9b2a 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -15,6 +15,7 @@
 #include <asm/asm.h>
 #include <asm/smap.h>
 #include <asm/export.h>
+#include <asm/trapnr.h>
 
 .macro ALIGN_DESTINATION
/* check for bad alignment of destination */
@@ -221,6 +222,7 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string)
  * Try to copy last bytes and clear the rest if needed.
  * Since protection fault in copy_from/to_user is not a normal situation,
  * it is not necessary to optimize tail handling.
+ * Don't try to copy the tail if machine check happened
  *
  * Input:
  * rdi destination
@@ -232,11 +234,24 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string)
  */
 SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail)
movl %edx,%ecx
+   cmp $X86_TRAP_MC,%eax   /* check if X86_TRAP_MC */
+   je 3f
 1: rep movsb
 2: mov %ecx,%eax
ASM_CLAC
ret
 
+   /*
+* Return zero to pretend that this copy succeeded. This
+* is counter-intuitive, but needed to prevent the code
+* in lib/iov_iter.c from retrying and running back into
+* the poison cache line again. The machine check handler
+* will ensure that a SIGBUS is sent to the task.
+*/
+3: xorl %eax,%eax
+   ASM_CLAC
+   ret
+
_ASM_EXTABLE_CPY(1b, 2b)
 SYM_CODE_END(.Lcopy_user_handle_tail)
 

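The "return zero" trick makes sense from the caller's side: the raw copy routines report the number of bytes that were not copied. A hedged sketch of a typical caller (names are illustrative):

    #include <linux/uaccess.h>

    static int read_user_blob(void *dst, const void __user *src, size_t len)
    {
            size_t uncopied = copy_from_user(dst, src, len);

            if (uncopied)           /* page-fault path: some bytes are missing */
                    return -EFAULT;
            return 0;               /* the #MC tail handler also lands here:
                                     * "success" now, SIGBUS arrives later via
                                     * task_work */
    }

Reporting zero remaining bytes stops lib/iov_iter.c from retrying the copy and touching the poisoned cache line again.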

[tip: ras/core] x86/mce: Decode a kernel instruction to determine if it is copying from user

2020-10-07 Thread tip-bot2 for Tony Luck
The following commit has been merged into the ras/core branch of tip:

Commit-ID: 300638101329e8f1569115f3d7197ef5ef754a3a
Gitweb:
https://git.kernel.org/tip/300638101329e8f1569115f3d7197ef5ef754a3a
Author: Tony Luck
AuthorDate: Tue, 06 Oct 2020 14:09:10 -07:00
Committer: Borislav Petkov 
CommitterDate: Wed, 07 Oct 2020 11:32:40 +02:00

x86/mce: Decode a kernel instruction to determine if it is copying from user

All instructions copying data between kernel and user memory
are tagged with either _ASM_EXTABLE_UA or _ASM_EXTABLE_CPY
entries in the exception table. ex_fault_handler_type() returns
EX_HANDLER_UACCESS for both of these.

Recovery is only possible when the machine check was triggered
on a read from user memory. In this case the same strategy for
recovery applies as if the user had made the access in ring3. If
the fault was in kernel memory while copying to user there is no
current recovery plan.

For MOV and MOVZ instructions a full decode of the instruction
is done to find the source address. For MOVS instructions
the source address is in the %rsi register. The function
fault_in_kernel_space() determines whether the source address is
kernel or user, upgrade it from "static" so it can be used here.

Co-developed-by: Youquan Song 
Signed-off-by: Youquan Song 
Signed-off-by: Tony Luck 
Signed-off-by: Borislav Petkov 
Link: https://lkml.kernel.org/r/20201006210910.21062-7-tony.l...@intel.com
---
 arch/x86/include/asm/traps.h   |  2 +-
 arch/x86/kernel/cpu/mce/core.c | 11 --
 arch/x86/kernel/cpu/mce/severity.c | 53 -
 arch/x86/mm/fault.c|  2 +-
 4 files changed, 63 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 714b1a3..df0b7bf 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -35,6 +35,8 @@ extern int panic_on_unrecovered_nmi;
 
 void math_emulate(struct math_emu_info *);
 
+bool fault_in_kernel_space(unsigned long address);
+
 #ifdef CONFIG_VMAP_STACK
 void __noreturn handle_stack_overflow(const char *message,
  struct pt_regs *regs,
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 5c423c4..3d6e1bf 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1250,14 +1250,19 @@ static void kill_me_maybe(struct callback_head *cb)
if (!p->mce_ripv)
flags |= MF_MUST_KILL;
 
-   if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) {
+   if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags) &&
+   !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) {
set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
sync_core();
return;
}
 
-   pr_err("Memory error not recovered");
-   kill_me_now(cb);
+   if (p->mce_vaddr != (void __user *)-1l) {
+   force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT);
+   } else {
+   pr_err("Memory error not recovered");
+   kill_me_now(cb);
+   }
 }
 
 static void queue_task_work(struct mce *m, int kill_it)
diff --git a/arch/x86/kernel/cpu/mce/severity.c 
b/arch/x86/kernel/cpu/mce/severity.c
index c6494e6..83df991 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -13,6 +13,9 @@
 
 #include <asm/mce.h>
 #include <asm/intel-family.h>
+#include <asm/traps.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
 
 #include "internal.h"
 
@@ -212,6 +215,47 @@ static struct severity {
 #define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \
(MCG_STATUS_RIPV|MCG_STATUS_EIPV))
 
+static bool is_copy_from_user(struct pt_regs *regs)
+{
+   u8 insn_buf[MAX_INSN_SIZE];
+   struct insn insn;
+   unsigned long addr;
+
+   if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE))
+   return false;
+
+   kernel_insn_init(&insn, insn_buf, MAX_INSN_SIZE);
+   insn_get_opcode(&insn);
+   if (!insn.opcode.got)
+   return false;
+
+   switch (insn.opcode.value) {
+   /* MOV mem,reg */
+   case 0x8A: case 0x8B:
+   /* MOVZ mem,reg */
+   case 0xB60F: case 0xB70F:
+   insn_get_modrm(&insn);
+   insn_get_sib(&insn);
+   if (!insn.modrm.got || !insn.sib.got)
+   return false;
+   addr = (unsigned long)insn_get_addr_ref(&insn, regs);
+   break;
+   /* REP MOVS */
+   case 0xA4: case 0xA5:
+   addr = regs->si;
+   break;
+   default:
+   return false;
+   }
+
+   if (fault_in_kernel_space(addr))
+   return false;
+
+   current->mce_vaddr = (void __user *)addr;
+
+   return true;
+}
+
 /*
  * If mcgstatus indicated that ip/cs on the stack were
  * no good, then "m->cs" will be zero and we will have
@@ -229,10 +273,17 @@ static int error_context(struct 

[tip: ras/core] x86/mce: Drop AMD-specific "DEFERRED" case from Intel severity rule list

2020-09-29 Thread tip-bot2 for Tony Luck
The following commit has been merged into the ras/core branch of tip:

Commit-ID: ed9705e4ad1c19ae51ed0cb4c112f9eb6dfc69fc
Gitweb:
https://git.kernel.org/tip/ed9705e4ad1c19ae51ed0cb4c112f9eb6dfc69fc
Author: Tony Luck
AuthorDate: Tue, 29 Sep 2020 19:13:13 -07:00
Committer: Borislav Petkov 
CommitterDate: Wed, 30 Sep 2020 07:49:58 +02:00

x86/mce: Drop AMD-specific "DEFERRED" case from Intel severity rule list

Way back in v3.19 Intel and AMD shared the same machine check severity
grading code. So it made sense to add a case for AMD DEFERRED errors in
commit

  e3480271f592 ("x86, mce, severity: Extend the the mce_severity mechanism to 
handle UCNA/DEFERRED error")

But later in v4.2 AMD switched to a separate grading function in
commit

  bf80bbd7dcf5 ("x86/mce: Add an AMD severities-grading function")

Belatedly drop the DEFERRED case from the Intel rule list.

Signed-off-by: Tony Luck 
Signed-off-by: Borislav Petkov 
Link: https://lkml.kernel.org/r/20200930021313.31810-3-tony.l...@intel.com
---
 arch/x86/kernel/cpu/mce/severity.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/severity.c 
b/arch/x86/kernel/cpu/mce/severity.c
index 567ce09..e072246 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -97,10 +97,6 @@ static struct severity {
EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0)
),
MCESEV(
-   DEFERRED, "Deferred error",
-   NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED)
-   ),
-   MCESEV(
KEEP, "Corrected error",
NOSER, BITCLR(MCI_STATUS_UC)
),


[tip: ras/core] x86/mce: Stop mce_reign() from re-computing severity for every CPU

2020-09-14 Thread tip-bot2 for Tony Luck
The following commit has been merged into the ras/core branch of tip:

Commit-ID: 13c877f4b48b943105ad9e13ba2c7a093fb694e8
Gitweb:
https://git.kernel.org/tip/13c877f4b48b943105ad9e13ba2c7a093fb694e8
Author: Tony Luck
AuthorDate: Tue, 08 Sep 2020 10:55:12 -07:00
Committer: Borislav Petkov 
CommitterDate: Mon, 14 Sep 2020 19:25:23 +02:00

x86/mce: Stop mce_reign() from re-computing severity for every CPU

Back in commit:

  20d51a426fe9 ("x86/mce: Reuse one of the u16 padding fields in 'struct mce'")

a field was added to "struct mce" to save the computed error severity.

Make use of this in mce_reign() to avoid re-computing the severity
for every CPU.

In the case where the machine panics, one call to mce_severity() is
still needed in order to provide the correct message giving the reason
for the panic.

Signed-off-by: Tony Luck 
Signed-off-by: Borislav Petkov 
Link: https://lkml.kernel.org/r/20200908175519.14223-2-tony.l...@intel.com
---
 arch/x86/kernel/cpu/mce/core.c | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index a697bae..5b1d5f3 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -920,7 +920,6 @@ static void mce_reign(void)
struct mce *m = NULL;
int global_worst = 0;
char *msg = NULL;
-   char *nmsg = NULL;
 
/*
 * This CPU is the Monarch and the other CPUs have run
@@ -928,12 +927,10 @@ static void mce_reign(void)
 * Grade the severity of the errors of all the CPUs.
 */
for_each_possible_cpu(cpu) {
-   int severity = mce_severity(&per_cpu(mces_seen, cpu),
-   mca_cfg.tolerant,
-   &nmsg, true);
-   if (severity > global_worst) {
-   msg = nmsg;
-   global_worst = severity;
+   struct mce *mtmp = &per_cpu(mces_seen, cpu);
+
+   if (mtmp->severity > global_worst) {
+   global_worst = mtmp->severity;
m = &per_cpu(mces_seen, cpu);
}
}
@@ -943,8 +940,11 @@ static void mce_reign(void)
 * This dumps all the mces in the log buffer and stops the
 * other CPUs.
 */
-   if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
+   if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
+   /* call mce_severity() to get "msg" for panic */
+   mce_severity(m, mca_cfg.tolerant, &msg, true);
mce_panic("Fatal machine check", m, msg);
+   }
 
/*
 * For UC somewhere we let the CPU who detects it handle it.


[tip: ras/core] x86/mce: Delay clearing IA32_MCG_STATUS to the end of do_machine_check()

2020-08-26 Thread tip-bot2 for Tony Luck
The following commit has been merged into the ras/core branch of tip:

Commit-ID: 1e36d9c6886849c6f3d3c836370563e6bc1a6ddd
Gitweb:
https://git.kernel.org/tip/1e36d9c6886849c6f3d3c836370563e6bc1a6ddd
Author: Tony Luck
AuthorDate: Mon, 24 Aug 2020 15:12:37 -07:00
Committer: Borislav Petkov 
CommitterDate: Wed, 26 Aug 2020 18:40:18 +02:00

x86/mce: Delay clearing IA32_MCG_STATUS to the end of do_machine_check()

A long time ago, Linux cleared IA32_MCG_STATUS at the very end of machine
check processing.

Then, some fancy recovery and IST manipulation was added in:

  d4812e169de4 ("x86, mce: Get rid of TIF_MCE_NOTIFY and associated mce tricks")

and clearing IA32_MCG_STATUS was pulled earlier in the function.

Next change moved the actual recovery out of do_machine_check() and
just used task_work_add() to schedule it later (before returning to the
user):

  5567d11c21a1 ("x86/mce: Send #MC singal from task work")

Most recently the fancy IST footwork was removed as no longer needed:

  b052df3da821 ("x86/entry: Get rid of ist_begin/end_non_atomic()")

At this point there is no reason remaining to clear IA32_MCG_STATUS early.
It can move back to the very end of the function.

Also move sync_core(). The comments for this function say that it should
only be called when instructions have been changed/re-mapped. Recovery
for an instruction fetch may change the physical address. But that
doesn't happen until the scheduled work runs (which could be on another
CPU).

 [ bp: Massage commit message. ]

Reported-by: Gabriele Paoloni 
Signed-off-by: Tony Luck 
Signed-off-by: Borislav Petkov 
Link: https://lkml.kernel.org/r/20200824221237.5397-1-tony.l...@intel.com
---
 arch/x86/kernel/cpu/mce/core.c |  9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index f43a78b..0ba24df 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1190,6 +1190,7 @@ static void kill_me_maybe(struct callback_head *cb)
 
if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) {
set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
+   sync_core();
return;
}
 
@@ -1330,12 +1331,8 @@ noinstr void do_machine_check(struct pt_regs *regs)
if (worst > 0)
irq_work_queue(&mce_irq_work);
 
-   mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
-
-   sync_core();
-
if (worst != MCE_AR_SEVERITY && !kill_it)
-   return;
+   goto out;
 
/* Fault was in user mode and we need to take some action */
if ((m.cs & 3) == 3) {
@@ -1364,6 +1361,8 @@ noinstr void do_machine_check(struct pt_regs *regs)
mce_panic("Failed kernel mode recovery", , 
msg);
}
}
+out:
+   mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
 


[tip: x86/cpu] x86/cpu: Add Lakefield, Alder Lake and Rocket Lake models to the to Intel CPU family

2020-07-25 Thread tip-bot2 for Tony Luck
The following commit has been merged into the x86/cpu branch of tip:

Commit-ID: e00b62f0b06d0ae2b844049f216807617aff0cdb
Gitweb:
https://git.kernel.org/tip/e00b62f0b06d0ae2b844049f216807617aff0cdb
Author: Tony Luck
AuthorDate: Mon, 20 Jul 2020 21:37:49 -07:00
Committer: Ingo Molnar 
CommitterDate: Sat, 25 Jul 2020 12:16:59 +02:00

x86/cpu: Add Lakefield, Alder Lake and Rocket Lake models to the to Intel CPU 
family

Add three new Intel CPU models.

Signed-off-by: Tony Luck 
Signed-off-by: Ingo Molnar 
Link: https://lore.kernel.org/r/20200721043749.31567-1-tony.l...@intel.com
---
 arch/x86/include/asm/intel-family.h | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/arch/x86/include/asm/intel-family.h 
b/arch/x86/include/asm/intel-family.h
index a338a6d..5e658ba 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -89,8 +89,15 @@
 #define INTEL_FAM6_COMETLAKE   0xA5
 #define INTEL_FAM6_COMETLAKE_L 0xA6
 
+#define INTEL_FAM6_ROCKETLAKE  0xA7
+
 #define INTEL_FAM6_SAPPHIRERAPIDS_X    0x8F
 
+/* Hybrid Core/Atom Processors */
+
+#define INTEL_FAM6_LAKEFIELD   0x8A
+#define INTEL_FAM6_ALDERLAKE   0x97
+
 /* "Small Core" Processors (Atom) */
 
 #define INTEL_FAM6_ATOM_BONNELL    0x1C /* Diamondville, Pineview */


[tip: efi/urgent] efivarfs: Don't return -EINTR when rate-limiting reads

2020-06-19 Thread tip-bot2 for Tony Luck
The following commit has been merged into the efi/urgent branch of tip:

Commit-ID: 4353f03317fd3eb0bd803b61bdb287b68736a728
Gitweb:
https://git.kernel.org/tip/4353f03317fd3eb0bd803b61bdb287b68736a728
Author: Tony Luck
AuthorDate: Thu, 28 May 2020 12:49:05 -07:00
Committer: Ard Biesheuvel 
CommitterDate: Mon, 15 Jun 2020 14:38:56 +02:00

efivarfs: Don't return -EINTR when rate-limiting reads

Applications that read EFI variables may see a return
value of -EINTR if they exceed the rate limit and a
signal delivery is attempted while the process is sleeping.

This is quite surprising to the application, which probably
doesn't have code to handle it.

Change the interruptible sleep to a non-interruptible one.

Reported-by: Lennart Poettering 
Signed-off-by: Tony Luck 
Link: https://lore.kernel.org/r/20200528194905.690-3-tony.l...@intel.com
Signed-off-by: Ard Biesheuvel 
---
 fs/efivarfs/file.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index 4b8bc45..feaa5e1 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -73,10 +73,8 @@ static ssize_t efivarfs_file_read(struct file *file, char 
__user *userbuf,
ssize_t size = 0;
int err;
 
-   while (!__ratelimit(&file->f_cred->user->ratelimit)) {
-   if (!msleep_interruptible(50))
-   return -EINTR;
-   }
+   while (!__ratelimit(&file->f_cred->user->ratelimit))
+   msleep(50);
 
err = efivar_entry_size(var, &datasize);
 

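From the application side, a minimal reader of an efivarfs file (the variable path is only an example). With the old interruptible sleep, the read() below could fail with EINTR while being rate-limited, which code like this rarely expects; after the change it simply blocks a little longer.

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            const char *path = "/sys/firmware/efi/efivars/"
                               "BootOrder-8be4df61-93ca-11d2-aa0d-00e098032b8c";
            char buf[512];
            int fd = open(path, O_RDONLY);

            if (fd < 0)
                    return 1;
            /* first 4 bytes are the variable attributes, the data follows */
            if (read(fd, buf, sizeof(buf)) < 0)
                    perror("read");
            close(fd);
            return 0;
    }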

[tip: efi/urgent] efivarfs: Update inode modification time for successful writes

2020-06-19 Thread tip-bot2 for Tony Luck
The following commit has been merged into the efi/urgent branch of tip:

Commit-ID: 2096721f1577b51b574fa06a7d91823dffe7267a
Gitweb:
https://git.kernel.org/tip/2096721f1577b51b574fa06a7d91823dffe7267a
Author: Tony Luck
AuthorDate: Thu, 28 May 2020 12:49:04 -07:00
Committer: Ard Biesheuvel 
CommitterDate: Mon, 15 Jun 2020 14:38:56 +02:00

efivarfs: Update inode modification time for successful writes

Some applications want to be able to see when EFI variables
have been updated.

Update the modification time for successful writes.

Reported-by: Lennart Poettering 
Signed-off-by: Tony Luck 
Link: https://lore.kernel.org/r/20200528194905.690-2-tony.l...@intel.com
Signed-off-by: Ard Biesheuvel 
---
 fs/efivarfs/file.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index e9e27a2..4b8bc45 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -51,6 +51,7 @@ static ssize_t efivarfs_file_write(struct file *file,
} else {
inode_lock(inode);
i_size_write(inode, datasize + sizeof(attributes));
+   inode->i_mtime = current_time(inode);
inode_unlock(inode);
}
 

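On the consumer side, updating i_mtime means a tool can notice changes with plain stat() (or inotify). A small sketch, with the path and helper name being examples only:

    #include <stdbool.h>
    #include <sys/stat.h>
    #include <time.h>

    static bool efivar_changed(const char *path, struct timespec *last)
    {
            struct stat st;

            if (stat(path, &st) < 0)
                    return false;
            if (st.st_mtim.tv_sec == last->tv_sec &&
                st.st_mtim.tv_nsec == last->tv_nsec)
                    return false;
            *last = st.st_mtim;     /* remember the new modification time */
            return true;
    }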

[tip: x86/fsgsbase] x86/speculation/swapgs: Check FSGSBASE in enabling SWAPGS mitigation

2020-06-18 Thread tip-bot2 for Tony Luck
The following commit has been merged into the x86/fsgsbase branch of tip:

Commit-ID: 978e1342c3c4d7b20808fd5875d9ac0d57db22ee
Gitweb:
https://git.kernel.org/tip/978e1342c3c4d7b20808fd5875d9ac0d57db22ee
Author: Tony Luck
AuthorDate: Thu, 28 May 2020 16:13:54 -04:00
Committer: Thomas Gleixner 
CommitterDate: Thu, 18 Jun 2020 15:47:02 +02:00

x86/speculation/swapgs: Check FSGSBASE in enabling SWAPGS mitigation

Before enabling FSGSBASE the kernel could safely assume that the content
of GS base was a user address. Thus any speculative access as the result
of a mispredicted branch controlling the execution of SWAPGS would be to
a user address. So systems with speculation-proof SMAP did not need to
add additional LFENCE instructions to mitigate.

With FSGSBASE enabled a hostile user can set GS base to a kernel address.
So they can make the kernel speculatively access data they wish to leak
via a side channel. This means that SMAP provides no protection.

Add FSGSBASE as an additional condition to enable the fence-based SWAPGS
mitigation.

Signed-off-by: Tony Luck 
Signed-off-by: Chang S. Bae 
Signed-off-by: Sasha Levin 
Signed-off-by: Thomas Gleixner 
Link: https://lkml.kernel.org/r/20200528201402.1708239-9-sas...@kernel.org


---
 arch/x86/kernel/cpu/bugs.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 0b71970..5ea5fbd 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -543,14 +543,12 @@ static void __init spectre_v1_select_mitigation(void)
 * If FSGSBASE is enabled, the user can put a kernel address in
 * GS, in which case SMAP provides no protection.
 *
-* [ NOTE: Don't check for X86_FEATURE_FSGSBASE until the
-* FSGSBASE enablement patches have been merged. ]
-*
 * If FSGSBASE is disabled, the user can only put a user space
 * address in GS.  That makes an attack harder, but still
 * possible if there's no SMAP protection.
 */
-   if (!smap_works_speculatively()) {
+   if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
+   !smap_works_speculatively()) {
/*
 * Mitigation can be provided from SWAPGS itself or
 * PTI as the CR3 write in the Meltdown mitigation


[tip: x86/urgent] x86/cpu: Add Sapphire Rapids CPU model number

2020-06-03 Thread tip-bot2 for Tony Luck
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: be25d1b5ea6a3a3ecbb5474e2ae8e32d2ba055ea
Gitweb:
https://git.kernel.org/tip/be25d1b5ea6a3a3ecbb5474e2ae8e32d2ba055ea
Author: Tony Luck
AuthorDate: Wed, 03 Jun 2020 10:33:52 -07:00
Committer: Borislav Petkov 
CommitterDate: Wed, 03 Jun 2020 19:53:41 +02:00

x86/cpu: Add Sapphire Rapids CPU model number

Latest edition (039) of "Intel Architecture Instruction Set Extensions
and Future Features Programming Reference" includes three new CPU model
numbers. Linux already has the two Ice Lake server ones. Add the new
model number for Sapphire Rapids.

Signed-off-by: Tony Luck 
Signed-off-by: Borislav Petkov 
Link: https://lkml.kernel.org/r/20200603173352.15506-1-tony.l...@intel.com
---
 arch/x86/include/asm/intel-family.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/include/asm/intel-family.h 
b/arch/x86/include/asm/intel-family.h
index 8f1e94f..a338a6d 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -89,6 +89,8 @@
 #define INTEL_FAM6_COMETLAKE   0xA5
 #define INTEL_FAM6_COMETLAKE_L 0xA6
 
+#define INTEL_FAM6_SAPPHIRERAPIDS_X    0x8F
+
 /* "Small Core" Processors (Atom) */
 
 #define INTEL_FAM6_ATOM_BONNELL    0x1C /* Diamondville, Pineview */


[tip: ras/core] x86/mce/dev-mcelog: Fix -Wstringop-truncation warning about strncpy()

2020-05-27 Thread tip-bot2 for Tony Luck
The following commit has been merged into the ras/core branch of tip:

Commit-ID: 45811ba140593e288a288c2a2e45d25f38d20d73
Gitweb:
https://git.kernel.org/tip/45811ba140593e288a288c2a2e45d25f38d20d73
Author: Tony Luck
AuthorDate: Wed, 27 May 2020 11:28:08 -07:00
Committer: Borislav Petkov 
CommitterDate: Wed, 27 May 2020 21:19:38 +02:00

x86/mce/dev-mcelog: Fix -Wstringop-truncation warning about strncpy()

The kbuild test robot reported this warning:

  arch/x86/kernel/cpu/mce/dev-mcelog.c: In function 'dev_mcelog_init_device':
  arch/x86/kernel/cpu/mce/dev-mcelog.c:346:2: warning: 'strncpy' output \
truncated before terminating nul copying 12 bytes from a string of the \
same length [-Wstringop-truncation]

This is accurate, but I don't care that the trailing NUL character isn't
copied. The string being copied is just a magic number signature so that
crash dump tools can be sure they are decoding the right blob of memory.

Use memcpy() instead of strncpy().

Fixes: d8ecca4043f2 ("x86/mce/dev-mcelog: Dynamically allocate space for 
machine check records")
Reported-by: kbuild test robot 
Signed-off-by: Tony Luck 
Signed-off-by: Borislav Petkov 
Link: https://lkml.kernel.org/r/20200527182808.27737-1-tony.l...@intel.com
---
 arch/x86/kernel/cpu/mce/dev-mcelog.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c 
b/arch/x86/kernel/cpu/mce/dev-mcelog.c
index a4fd528..43c4660 100644
--- a/arch/x86/kernel/cpu/mce/dev-mcelog.c
+++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c
@@ -349,7 +349,7 @@ static __init int dev_mcelog_init_device(void)
if (!mcelog)
return -ENOMEM;
 
-   strncpy(mcelog->signature, MCE_LOG_SIGNATURE, sizeof(mcelog->signature));
+   memcpy(mcelog->signature, MCE_LOG_SIGNATURE, sizeof(mcelog->signature));
mcelog->len = mce_log_len;
mcelog->recordlen = sizeof(struct mce);
 

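To illustrate the warning: the destination field is exactly as long as the signature string ("MACHINECHECK" is 12 characters), so strncpy() drops the terminating NUL. That is intended here, but it trips -Wstringop-truncation; memcpy() states the intent directly. Sketch only:

    #include <string.h>

    static void fill_signature(char sig[12])
    {
            /* strncpy(sig, "MACHINECHECK", 12);  -> -Wstringop-truncation */
            memcpy(sig, "MACHINECHECK", 12);      /* copy exactly 12 bytes, no NUL */
    }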

[tip: ras/core] x86/{mce,mm}: Unmap the entire page if the whole page is affected and poisoned

2020-05-26 Thread tip-bot2 for Tony Luck
The following commit has been merged into the ras/core branch of tip:

Commit-ID: be69f6c5cd38c457c22f6e718077f6524437369d
Gitweb:
https://git.kernel.org/tip/be69f6c5cd38c457c22f6e718077f6524437369d
Author: Tony Luck
AuthorDate: Wed, 20 May 2020 09:35:46 -07:00
Committer: Borislav Petkov 
CommitterDate: Mon, 25 May 2020 22:37:41 +02:00

x86/{mce,mm}: Unmap the entire page if the whole page is affected and poisoned

An interesting thing happened when a guest Linux instance took a machine
check. The VMM unmapped the bad page from guest physical space and
passed the machine check to the guest.

Linux took all the normal actions to offline the page from the process
that was using it. But then guest Linux crashed because it said there
was a second machine check inside the kernel with this stack trace:

  do_memory_failure
    set_mce_nospec
      set_memory_uc
        _set_memory_uc
          change_page_attr_set_clr
            cpa_flush
              clflush_cache_range_opt

This was odd, because a CLFLUSH instruction shouldn't raise a machine
check (it isn't consuming the data). Further investigation showed that
the VMM had passed in another machine check because it appeared that the
guest was accessing the bad page.

Fix is to check the scope of the poison by checking the MCi_MISC register.
If the entire page is affected, then unmap the page. If only part of the
page is affected, then mark the page as uncacheable.

This assumes that VMMs will do the logical thing and pass in the "whole
page scope" via the MCi_MISC register (since they unmapped the entire
page).

  [ bp: Adjust to x86/entry changes. ]

Fixes: 284ce4011ba6 ("x86/memory_failure: Introduce {set, clear}_mce_nospec()")
Reported-by: Jue Wang 
Signed-off-by: Tony Luck 
Signed-off-by: Borislav Petkov 
Tested-by: Jue Wang 
Cc: 
Link: 
https://lkml.kernel.org/r/20200520163546.ga7...@agluck-desk2.amr.corp.intel.com
---
 arch/x86/include/asm/set_memory.h | 19 +--
 arch/x86/kernel/cpu/mce/core.c| 18 ++
 include/linux/sched.h |  4 +++-
 include/linux/set_memory.h|  2 +-
 4 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/set_memory.h 
b/arch/x86/include/asm/set_memory.h
index ec2c0a0..5948218 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -86,28 +86,35 @@ int set_direct_map_default_noflush(struct page *page);
 extern int kernel_set_to_readonly;
 
 #ifdef CONFIG_X86_64
-static inline int set_mce_nospec(unsigned long pfn)
+/*
+ * Prevent speculative access to the page by either unmapping
+ * it (if we do not require access to any part of the page) or
+ * marking it uncacheable (if we want to try to retrieve data
+ * from non-poisoned lines in the page).
+ */
+static inline int set_mce_nospec(unsigned long pfn, bool unmap)
 {
unsigned long decoy_addr;
int rc;
 
/*
-* Mark the linear address as UC to make sure we don't log more
-* errors because of speculative access to the page.
 * We would like to just call:
-*  set_memory_uc((unsigned long)pfn_to_kaddr(pfn), 1);
+*  set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
 * but doing that would radically increase the odds of a
 * speculative access to the poison page because we'd have
 * the virtual address of the kernel 1:1 mapping sitting
 * around in registers.
 * Instead we get tricky.  We create a non-canonical address
 * that looks just like the one we want, but has bit 63 flipped.
-* This relies on set_memory_uc() properly sanitizing any __pa()
+* This relies on set_memory_XX() properly sanitizing any __pa()
 * results with __PHYSICAL_MASK or PTE_PFN_MASK.
 */
decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
 
-   rc = set_memory_uc(decoy_addr, 1);
+   if (unmap)
+   rc = set_memory_np(decoy_addr, 1);
+   else
+   rc = set_memory_uc(decoy_addr, 1);
if (rc)
pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
return rc;
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index ffee8a2..753bc77 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -520,6 +520,14 @@ bool mce_is_memory_error(struct mce *m)
 }
 EXPORT_SYMBOL_GPL(mce_is_memory_error);
 
+static bool whole_page(struct mce *m)
+{
+   if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
+   return true;
+
+   return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
+}
+
 bool mce_is_correctable(struct mce *m)
 {
if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
@@ -573,7 +581,7 @@ static int uc_decode_notifier(struct notifier_block *nb, 
unsigned long val,
 
pfn = mce->addr >> PAGE_SHIFT;

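For reference, the scope test in whole_page() above spelled out with the macro expanded (a sketch; constants as on x86-64 with 4K pages):

    static bool example_whole_page(u64 mci_misc)
    {
            unsigned int lsb = mci_misc & 0x3f;     /* MCI_MISC_ADDR_LSB(): bits 5:0 */

            /*
             * lsb is the position of the lowest valid address bit, i.e.
             * log2 of the region the error report covers: 6 means one
             * 64-byte cache line (mark the page UC), 12 or more means the
             * whole 4K page is gone (unmap it).
             */
            return lsb >= 12;                       /* PAGE_SHIFT */
    }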
[tip: ras/core] x86/{mce,mm}: Change so poison pages are either unmapped or marked uncacheable

2020-05-25 Thread tip-bot2 for Tony Luck
The following commit has been merged into the ras/core branch of tip:

Commit-ID: 3cb1ada80fe29e2fa022b5f20370b65718e0a744
Gitweb:
https://git.kernel.org/tip/3cb1ada80fe29e2fa022b5f20370b65718e0a744
Author: Tony Luck
AuthorDate: Wed, 20 May 2020 09:35:46 -07:00
Committer: Borislav Petkov 
CommitterDate: Mon, 25 May 2020 12:46:40 +02:00

x86/{mce,mm}: Change so poison pages are either unmapped or marked uncacheable

An interesting thing happened when a guest Linux instance took a machine
check. The VMM unmapped the bad page from guest physical space and
passed the machine check to the guest.

Linux took all the normal actions to offline the page from the process
that was using it. But then guest Linux crashed because it said there
was a second machine check inside the kernel with this stack trace:

  do_memory_failure
    set_mce_nospec
      set_memory_uc
        _set_memory_uc
          change_page_attr_set_clr
            cpa_flush
              clflush_cache_range_opt

This was odd, because a CLFLUSH instruction shouldn't raise a machine
check (it isn't consuming the data). Further investigation showed that
the VMM had passed in another machine check because it appeared that the
guest was accessing the bad page.

Fix is to check the scope of the poison by checking the MCi_MISC register.
If the entire page is affected, then unmap the page. If only part of the
page is affected, then mark the page as uncacheable.

This assumes that VMMs will do the logical thing and pass in the "whole
page scope" via the MCi_MISC register (since they unmapped the entire
page).

Fixes: 284ce4011ba6 ("x86/memory_failure: Introduce {set, clear}_mce_nospec()")
Reported-by: Jue Wang 
Signed-off-by: Tony Luck 
Signed-off-by: Borislav Petkov 
Tested-by: Jue Wang 
Cc: 
Link: 
https://lkml.kernel.org/r/20200520163546.ga7...@agluck-desk2.amr.corp.intel.com
---
 arch/x86/include/asm/set_memory.h | 19 +--
 arch/x86/kernel/cpu/mce/core.c| 11 +--
 include/linux/set_memory.h|  2 +-
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/set_memory.h 
b/arch/x86/include/asm/set_memory.h
index ec2c0a0..5948218 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -86,28 +86,35 @@ int set_direct_map_default_noflush(struct page *page);
 extern int kernel_set_to_readonly;
 
 #ifdef CONFIG_X86_64
-static inline int set_mce_nospec(unsigned long pfn)
+/*
+ * Prevent speculative access to the page by either unmapping
+ * it (if we do not require access to any part of the page) or
+ * marking it uncacheable (if we want to try to retrieve data
+ * from non-poisoned lines in the page).
+ */
+static inline int set_mce_nospec(unsigned long pfn, bool unmap)
 {
unsigned long decoy_addr;
int rc;
 
/*
-* Mark the linear address as UC to make sure we don't log more
-* errors because of speculative access to the page.
 * We would like to just call:
-*  set_memory_uc((unsigned long)pfn_to_kaddr(pfn), 1);
+*  set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
 * but doing that would radically increase the odds of a
 * speculative access to the poison page because we'd have
 * the virtual address of the kernel 1:1 mapping sitting
 * around in registers.
 * Instead we get tricky.  We create a non-canonical address
 * that looks just like the one we want, but has bit 63 flipped.
-* This relies on set_memory_uc() properly sanitizing any __pa()
+* This relies on set_memory_XX() properly sanitizing any __pa()
 * results with __PHYSICAL_MASK or PTE_PFN_MASK.
 */
decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
 
-   rc = set_memory_uc(decoy_addr, 1);
+   if (unmap)
+   rc = set_memory_np(decoy_addr, 1);
+   else
+   rc = set_memory_uc(decoy_addr, 1);
if (rc)
pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
return rc;
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 02e1f16..e35aece 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -518,6 +518,13 @@ bool mce_is_memory_error(struct mce *m)
 }
 EXPORT_SYMBOL_GPL(mce_is_memory_error);
 
+static bool whole_page(struct mce *m)
+{
+   if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
+   return true;
+   return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
+}
+
 bool mce_is_correctable(struct mce *m)
 {
if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
@@ -571,7 +578,7 @@ static int uc_decode_notifier(struct notifier_block *nb, 
unsigned long val,
 
pfn = mce->addr >> PAGE_SHIFT;
if (!memory_failure(pfn, 0)) {
-   set_mce_nospec(pfn);
+