[PATCH] sched,psi: fix typo in comment
s/exceution/execution/ s/possibe/possible/ s/manupulations/manipulations/ Signed-off-by: Xie XiuQi --- kernel/sched/psi.c | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 967732c0766c..7c800be47c6f 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -59,7 +59,7 @@ * states, we would have to conclude a CPU SOME pressure number of * 100%, since *somebody* is waiting on a runqueue at all * times. However, that is clearly not the amount of contention the - * workload is experiencing: only one out of 256 possible exceution + * workload is experiencing: only one out of 256 possible execution * threads will be contended at any given time, or about 0.4%. * * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any @@ -73,7 +73,7 @@ * we have to base our calculation on the number of non-idle tasks in * conjunction with the number of available CPUs, which is the number * of potential execution threads. SOME becomes then the proportion of - * delayed tasks to possibe threads, and FULL is the share of possible + * delayed tasks to possible threads, and FULL is the share of possible * threads that are unproductive due to delays: * * threads = min(nr_nonidle_tasks, nr_cpus) @@ -441,7 +441,8 @@ static void psi_avgs_work(struct work_struct *work) mutex_unlock(>avgs_lock); } -/* Trigger tracking window manupulations */ +/* Trigger tracking window manipulations */ + static void window_reset(struct psi_window *win, u64 now, u64 value, u64 prev_growth) { -- 2.25.1
[PATCH 0/2] soc/tegra: Fix build with Tegra234 configuration
If only Tegra234 support is enabled, we got this build error: make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- -j128 CALLscripts/atomic/check-atomics.sh CALLscripts/checksyscalls.sh CHK include/generated/compile.h CC drivers/soc/tegra/fuse/fuse-tegra30.o drivers/soc/tegra/fuse/fuse-tegra30.c:376:10: error: ‘tegra30_fuse_read’ undeclared here (not in a function); did you mean ‘tegra_fuse_readl’? .read = tegra30_fuse_read, ^ tegra_fuse_readl drivers/soc/tegra/fuse/fuse-tegra30.c:382:10: error: ‘tegra30_fuse_init’ undeclared here (not in a function); did you mean ‘tegra30_fuse_read’? .init = tegra30_fuse_init, ^ tegra30_fuse_read CC drivers/firmware/tegra/bpmp.o CHK kernel/kheaders_data.tar.xz drivers/firmware/tegra/bpmp.c:861:51: error: ‘tegra186_soc’ undeclared here (not in a function); did you mean ‘tegra_ivc’? { .compatible = "nvidia,tegra186-bpmp", .data = _soc }, ^~~~ tegra_ivc Xie XiuQi (2): soc/tegra: fuse: Fix build with Tegra234 configuration firmware: tegra: Fix build with Tegra234 configuration drivers/firmware/tegra/Makefile | 1 + drivers/firmware/tegra/bpmp-private.h | 3 ++- drivers/firmware/tegra/bpmp.c | 3 ++- drivers/soc/tegra/fuse/fuse-tegra30.c | 3 ++- 4 files changed, 7 insertions(+), 3 deletions(-) -- 2.25.1
[PATCH 2/2] firmware: tegra: Fix build with Tegra234 configuration
If only Tegra234 support is enabled, the tegra186_bpmp_ops and tegra186_soc are not defined and cause a build failure. Add Tegra234 to the preprocessor guard to make sure these functions are available for Tegra234-only builds as well. Fixes: 0ebdf11699d0 ("firmware: tegra: Enable BPMP support on Tegra234") Signed-off-by: Xie XiuQi --- drivers/firmware/tegra/Makefile | 1 + drivers/firmware/tegra/bpmp-private.h | 3 ++- drivers/firmware/tegra/bpmp.c | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/tegra/Makefile b/drivers/firmware/tegra/Makefile index 49c87e00fafb..620cf3fdd607 100644 --- a/drivers/firmware/tegra/Makefile +++ b/drivers/firmware/tegra/Makefile @@ -3,6 +3,7 @@ tegra-bpmp-y= bpmp.o tegra-bpmp-$(CONFIG_ARCH_TEGRA_210_SOC)+= bpmp-tegra210.o tegra-bpmp-$(CONFIG_ARCH_TEGRA_186_SOC)+= bpmp-tegra186.o tegra-bpmp-$(CONFIG_ARCH_TEGRA_194_SOC)+= bpmp-tegra186.o +tegra-bpmp-$(CONFIG_ARCH_TEGRA_234_SOC)+= bpmp-tegra186.o tegra-bpmp-$(CONFIG_DEBUG_FS) += bpmp-debugfs.o obj-$(CONFIG_TEGRA_BPMP) += tegra-bpmp.o obj-$(CONFIG_TEGRA_IVC)+= ivc.o diff --git a/drivers/firmware/tegra/bpmp-private.h b/drivers/firmware/tegra/bpmp-private.h index 54d560c48398..182bfe396516 100644 --- a/drivers/firmware/tegra/bpmp-private.h +++ b/drivers/firmware/tegra/bpmp-private.h @@ -24,7 +24,8 @@ struct tegra_bpmp_ops { }; #if IS_ENABLED(CONFIG_ARCH_TEGRA_186_SOC) || \ -IS_ENABLED(CONFIG_ARCH_TEGRA_194_SOC) +IS_ENABLED(CONFIG_ARCH_TEGRA_194_SOC) || \ +IS_ENABLED(CONFIG_ARCH_TEGRA_234_SOC) extern const struct tegra_bpmp_ops tegra186_bpmp_ops; #endif #if IS_ENABLED(CONFIG_ARCH_TEGRA_210_SOC) diff --git a/drivers/firmware/tegra/bpmp.c b/drivers/firmware/tegra/bpmp.c index 0742a90cb844..5654c5e9862b 100644 --- a/drivers/firmware/tegra/bpmp.c +++ b/drivers/firmware/tegra/bpmp.c @@ -809,7 +809,8 @@ static const struct dev_pm_ops tegra_bpmp_pm_ops = { }; #if IS_ENABLED(CONFIG_ARCH_TEGRA_186_SOC) || \ -IS_ENABLED(CONFIG_ARCH_TEGRA_194_SOC) +IS_ENABLED(CONFIG_ARCH_TEGRA_194_SOC) || \ +IS_ENABLED(CONFIG_ARCH_TEGRA_234_SOC) static const struct tegra_bpmp_soc tegra186_soc = { .channels = { .cpu_tx = { -- 2.25.1
[PATCH 1/2] soc/tegra: fuse: Fix build with Tegra234 configuration
If only Tegra234 support is enabled, the tegra30_fuse_read() and tegra30_fuse_init() function are not declared and cause a build failure. Add Tegra234 to the preprocessor guard to make sure these functions are available for Tegra234-only builds as well. Fixes: 1f44febf71ba ("soc/tegra: fuse: Add Tegra234 support") Signed-off-by: Xie XiuQi --- drivers/soc/tegra/fuse/fuse-tegra30.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/soc/tegra/fuse/fuse-tegra30.c b/drivers/soc/tegra/fuse/fuse-tegra30.c index 9ea7f0168457..c1aa7815bd6e 100644 --- a/drivers/soc/tegra/fuse/fuse-tegra30.c +++ b/drivers/soc/tegra/fuse/fuse-tegra30.c @@ -37,7 +37,8 @@ defined(CONFIG_ARCH_TEGRA_132_SOC) || \ defined(CONFIG_ARCH_TEGRA_210_SOC) || \ defined(CONFIG_ARCH_TEGRA_186_SOC) || \ -defined(CONFIG_ARCH_TEGRA_194_SOC) +defined(CONFIG_ARCH_TEGRA_194_SOC) || \ +defined(CONFIG_ARCH_TEGRA_234_SOC) static u32 tegra30_fuse_read_early(struct tegra_fuse *fuse, unsigned int offset) { if (WARN_ON(!fuse->base)) -- 2.25.1
[PATCH] scsi: qedi: remove unused variable udev & uctrl
uctrl and udev are unused after commit 9632a6b4b747 ("scsi: qedi: Move LL2 producer index processing in BH.") Remove them. Signed-off-by: Xie XiuQi --- drivers/scsi/qedi/qedi_main.c | 5 - 1 file changed, 5 deletions(-) diff --git a/drivers/scsi/qedi/qedi_main.c b/drivers/scsi/qedi/qedi_main.c index b995b19865ca..313f7e10aed9 100644 --- a/drivers/scsi/qedi/qedi_main.c +++ b/drivers/scsi/qedi/qedi_main.c @@ -658,8 +658,6 @@ static struct qedi_ctx *qedi_host_alloc(struct pci_dev *pdev) static int qedi_ll2_rx(void *cookie, struct sk_buff *skb, u32 arg1, u32 arg2) { struct qedi_ctx *qedi = (struct qedi_ctx *)cookie; - struct qedi_uio_dev *udev; - struct qedi_uio_ctrl *uctrl; struct skb_work_list *work; struct ethhdr *eh; @@ -698,9 +696,6 @@ static int qedi_ll2_rx(void *cookie, struct sk_buff *skb, u32 arg1, u32 arg2) "Allowed frame ethertype [0x%x] len [0x%x].\n", eh->h_proto, skb->len); - udev = qedi->udev; - uctrl = udev->uctrl; - work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { QEDI_WARN(>dbg_ctx, -- 2.20.1
[PATCH] Input: atmel_mxt_ts: remove meaningless checking
byte_offset is unsigned integer, it's always >= 0, so remove this meaningless checking. Signed-off-by: Xie XiuQi --- drivers/input/touchscreen/atmel_mxt_ts.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/touchscreen/atmel_mxt_ts.c b/drivers/input/touchscreen/atmel_mxt_ts.c index ae60442efda0..1bd072c5a49d 100644 --- a/drivers/input/touchscreen/atmel_mxt_ts.c +++ b/drivers/input/touchscreen/atmel_mxt_ts.c @@ -1368,7 +1368,7 @@ static int mxt_prepare_cfg_mem(struct mxt_data *data, struct mxt_cfg *cfg) byte_offset = reg + i - cfg->start_ofs; - if (byte_offset >= 0 && byte_offset < cfg->mem_size) { + if (byte_offset < cfg->mem_size) { *(cfg->mem + byte_offset) = val; } else { dev_err(dev, "Bad object: reg:%d, T%d, ofs=%d\n", -- 2.20.1
[PATCH net-next] ixgbe: fix signed-integer-overflow warning
ubsan report this warning, fix it by adding a unsigned suffix. UBSAN: signed-integer-overflow in drivers/net/ethernet/intel/ixgbe/ixgbe_common.c:2246:26 65535 * 65537 cannot be represented in type 'int' CPU: 21 PID: 7 Comm: kworker/u256:0 Not tainted 5.7.0-rc3-debug+ #39 Hardware name: Huawei TaiShan 2280 V2/BC82AMDC, BIOS 2280-V2 03/27/2020 Workqueue: ixgbe ixgbe_service_task [ixgbe] Call trace: dump_backtrace+0x0/0x3f0 show_stack+0x28/0x38 dump_stack+0x154/0x1e4 ubsan_epilogue+0x18/0x60 handle_overflow+0xf8/0x148 __ubsan_handle_mul_overflow+0x34/0x48 ixgbe_fc_enable_generic+0x4d0/0x590 [ixgbe] ixgbe_service_task+0xc20/0x1f78 [ixgbe] process_one_work+0x8f0/0xf18 worker_thread+0x430/0x6d0 kthread+0x218/0x238 ret_from_fork+0x10/0x18 Reported-by: Hulk Robot Signed-off-by: Xie XiuQi --- drivers/net/ethernet/intel/ixgbe/ixgbe_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c index 0bd1294ba517..39c5e6fdb72c 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c @@ -2243,7 +2243,7 @@ s32 ixgbe_fc_enable_generic(struct ixgbe_hw *hw) } /* Configure pause time (2 TCs per register) */ - reg = hw->fc.pause_time * 0x00010001; + reg = hw->fc.pause_time * 0x00010001U; for (i = 0; i < (MAX_TRAFFIC_CLASS / 2); i++) IXGBE_WRITE_REG(hw, IXGBE_FCTTV(i), reg); -- 2.20.1
[tip: sched/core] sched/debug: Fix trival print_task() format
The following commit has been merged into the sched/core branch of tip: Commit-ID: f080d93e1d419099a99d7473ed532289ca8dc717 Gitweb: https://git.kernel.org/tip/f080d93e1d419099a99d7473ed532289ca8dc717 Author:Xie XiuQi AuthorDate:Tue, 14 Apr 2020 20:57:21 +08:00 Committer: Peter Zijlstra CommitterDate: Thu, 30 Apr 2020 20:14:37 +02:00 sched/debug: Fix trival print_task() format Ensure leave one space between state and task name. w/o patch: runnable tasks: S task PID tree-key switches prio wait Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200414125721.195801-1-xiexi...@huawei.com --- kernel/sched/debug.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index a562df5..b3ac1c1 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -437,7 +437,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) else SEQ_printf(m, " %c", task_state_to_char(p)); - SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", + SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ", p->comm, task_pid_nr(p), SPLIT_NS(p->se.vruntime), (long long)(p->nvcsw + p->nivcsw), @@ -464,10 +464,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) SEQ_printf(m, "\n"); SEQ_printf(m, "runnable tasks:\n"); - SEQ_printf(m, " S task PID tree-key switches prio" + SEQ_printf(m, " Stask PID tree-key switches prio" " wait-time sum-execsum-sleep\n"); SEQ_printf(m, "---" - "\n"); + "--\n"); rcu_read_lock(); for_each_process_thread(g, p) {
Re: [PATCH] arm64: panic on synchronous external abort in kernel context
Hi James, Sorry for reply late. On 2020/4/14 22:53, James Morse wrote: > Hi Xie, > > On 14/04/2020 13:39, Xie XiuQi wrote: >> On 2020/4/14 20:16, James Morse wrote: >>> On 14/04/2020 11:59, Mark Rutland wrote: >>>> On Fri, Apr 10, 2020 at 09:52:45AM +0800, Xie XiuQi wrote: >>>>> We should panic even panic_on_oops is not set, when we can't recover >>>>> from synchronous external abort in kernel context. >>> >>> Hmm, fault-from-kernel-context doesn't mean the fault affects the kernel. >>> If the kernel is >>> reading or writing from user-space memory for a syscall, its the user-space >>> memory that is >>> affected. This thread can't make progress, so we kill it. >>> If its a kernel thread or we were in irq context, we panic(). >>> >>> I don't think you really want all faults that happen as a result of a >>> kernel access to be >>> fatal! > >> Yes, you're right. I just want to fix a hung up when ras error inject >> testing. >> >> panic_on_oops is not set in the kernel image for testing. When receiving a >> sea in kernel >> context, the PE trap in do_exit(), and can't return any more. > > trap? gets trapped, (or gets stuck, to prevent confusion with the > architectures use of the > word 'trap'!) > > >> I analyze the source code, the call trace might like this: >>do_mem_abort >> -> arm64_notify_die >> -> die# kernel context, call die() directly; >>-> do_exit # kernel process context, call >> do_exit(SIGSEGV); >> -> do_task_dead() # call do_task_dead(), and hung up this >> core; > > Thanks for the trace. This describes a corrected error in your I-cache, that > occurred > while the kernel was executing a kernel thread. These shouldn't be fatal, > because it was > corrected ... but the kernel doesn't know that because it doesn't know how to > parse those > records. > > There are two things wrong here: > 1. it locks up while trying to kill the thread. > 2. it tried to kill the thread in the first place! > > For 1, does your l1l2_inject module take any spinlocks or tinker with the > pre-empt counter? > > I suspect this is some rarely-tested path in do_task_dead() that sleeps, but > can't from > your l1l2_inject module because the pre-empt counter has been raised. > > CONFIG_DEBUG_ATOMIC_SLEEP may point at the function to blame. > > It may be accessing some thread data that kthreads don't have, taking a > second exception > and blocking on the die_lock. LOCKDEP should catch this one. > > We should fix this one first. > I analyze the l1l2_inject module, there is a kworker thread used to inject error. do_sea() try to kill the kworker thread, so the work(s) queued on this kworker is blocked. panic_on_oops option is usually set on production environment, so if someone unset this option for testing, he should take his own risk. Is it right? > > 2 is a bit more complicated. Today, this is fatal because the arch code > assumes this was > probably a memory error, and if it returns to user-space it can't avoid > getting stuck in a > loop until the scheduled memory_failure() work runs. Instead it > unconditionally signals > the process. > > [0] fixes this up for memory errors. But in this case it will assume all the > work has been > done by APEI, (or will be before we get back to user-space). APEI ignored the > processor > error you fed it, because it doesn't know what they are, they are just > printed out. > > This is fine for corrected errors, but were are reliant on your firmware > describing > uncorrected errors with a 'fatal' severity... which might be too heavy a > hammer. (Ideally > that would mean 'uncontained', and the kernel should handle, or detect it > can't, any other > errror...) > > We can discuss the right thing to do here when support for parsing these 'arm > processor > errors' is posted. > (If you think I need to do something different in [0] because of this, please > shout!) For some errors which could be recoverable or corrected, do_sea() should not kill process or die() unconditionally. It's better detect this situation. > > >> [ 387.740609] {1}[Hardware Error]: Hardware error from APEI Generic >> Hardware Error Source: 9 >> [ 387.748837] {1}[Hardware Error]: event severity: recoverable >> [ 387.754470] {1}[Hardware Error]: Error 0, type: recoverable > >> [ 387.760103] {1}[Hardware Error]: section_type: ARM processor error > > et voila! Case 2. Linux doesn't handl
Re: [PATCH] epoll: optimize epmutex in ep_free and eventpoll_release_file
* a lockdep warning. >*/ > - mutex_lock(>mtx); > while ((rbp = rb_first_cached(>rbr)) != NULL) { > epi = rb_entry(rbp, struct epitem, rbn); > ep_remove(ep, epi); > @@ -854,11 +885,19 @@ static void ep_free(struct eventpoll *ep) > } > mutex_unlock(>mtx); > > - mutex_unlock(); > - mutex_destroy(>mtx); > - free_uid(ep->user); > - wakeup_source_unregister(ep->ws); > - kfree(ep); > + /* > + * ep will not been added to visited_list, because ep_ctrl() > + * can not get its reference and can not reference it by the > + * corresponding epitem. The only possible operation is list_del_init, > + * so it's OK to use list_empty_careful() here. > + */ > + if (!list_empty_careful(>visited_list_link)) { > + mutex_lock(); > + list_del_init(>visited_list_link); > + mutex_unlock(); > + } > + > + eventpoll_put_ep(ep); > } > > static int ep_eventpoll_release(struct inode *inode, struct file *file) > @@ -985,7 +1024,7 @@ static const struct file_operations eventpoll_fops = { > void eventpoll_release_file(struct file *file) > { > struct eventpoll *ep; > - struct epitem *epi, *next; > + struct epitem *epi; > > /* >* We don't want to get "file->f_lock" because it is not > @@ -1000,14 +1039,51 @@ void eventpoll_release_file(struct file *file) >* >* Besides, ep_remove() acquires the lock, so we can't hold it here. >*/ > - mutex_lock(); > - list_for_each_entry_safe(epi, next, >f_ep_links, fllink) { > - ep = epi->ep; > + rcu_read_lock(); > + while (true) { > + epi = list_first_or_null_rcu(>f_ep_links, > + struct epitem, fllink); > + if (!epi) > + break; > + > + ep = eventpoll_get_ep(epi->ep); > + /* Current epi had been removed by ep_free() */ > + if (!ep) > + continue; > + rcu_read_unlock(); > + > mutex_lock_nested(>mtx, 0); > - ep_remove(ep, epi); > + /* > + * If rb_first_cached() returns NULL, it means that > + * the current epi had been removed by ep_free(). > + * To prevent epi from double-freeing, check the > + * condition before invoking ep_remove(). > + * If eventpoll_release_file() frees epi firstly, > + * the epi will not be freed again because the epi > + * must have been removed from ep->rbr when ep_free() > + * is invoked. > + */ > + if (rb_first_cached(>rbr)) > + ep_remove(ep, epi); > mutex_unlock(>mtx); > + > + eventpoll_put_ep(ep); > + > + rcu_read_lock(); > + } > + rcu_read_unlock(); > + > + /* > + * The file can not been added to tfile_check_list again, because > + * (1) refcnt has been zero, ep_ctrl() can no longer get its reference > + * (2) related ep items have been removed, ep_loop_check_proc() can not > + * get the file by ep->rbr. > + */ > + if (!list_empty_careful(>f_tfile_llink)) { > + mutex_lock(); > + list_del_init(>f_tfile_llink); > + mutex_unlock(); > } > - mutex_unlock(); > } > > static int ep_alloc(struct eventpoll **pep) > @@ -1030,6 +1106,8 @@ static int ep_alloc(struct eventpoll **pep) > ep->rbr = RB_ROOT_CACHED; > ep->ovflist = EP_UNACTIVE_PTR; > ep->user = user; > + INIT_LIST_HEAD(>visited_list_link); > + refcount_set(>ref, 1); > > *pep = ep; > > @@ -2018,7 +2096,7 @@ static int ep_loop_check(struct eventpoll *ep, struct > file *file) > list_for_each_entry_safe(ep_cur, ep_next, _list, > visited_list_link) { > ep_cur->visited = 0; > - list_del(_cur->visited_list_link); > + list_del_init(_cur->visited_list_link); > } > return ret; > } > -- Thanks, Xie XiuQi
[tip:sched/urgent] sched/numa: Fix a possible divide-by-zero
Commit-ID: a860fa7b96e1a1c974556327aa1aee852d434c21 Gitweb: https://git.kernel.org/tip/a860fa7b96e1a1c974556327aa1aee852d434c21 Author: Xie XiuQi AuthorDate: Sat, 20 Apr 2019 16:34:16 +0800 Committer: Ingo Molnar CommitDate: Thu, 25 Apr 2019 19:58:54 +0200 sched/numa: Fix a possible divide-by-zero sched_clock_cpu() may not be consistent between CPUs. If a task migrates to another CPU, then se.exec_start is set to that CPU's rq_clock_task() by update_stats_curr_start(). Specifically, the new value might be before the old value due to clock skew. So then if in numa_get_avg_runtime() the expression: 'now - p->last_task_numa_placement' ends up as -1, then the divider '*period + 1' in task_numa_placement() is 0 and things go bang. Similar to update_curr(), check if time goes backwards to avoid this. [ peterz: Wrote new changelog. ] [ mingo: Tweaked the code comment. ] Signed-off-by: Xie XiuQi Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: cj.chengj...@huawei.com Cc: Link: http://lkml.kernel.org/r/20190425080016.gx11...@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 1 file changed, 4 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a4d9e14bf138..35f3ea375084 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2007,6 +2007,10 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) if (p->last_task_numa_placement) { delta = runtime - p->last_sum_exec_runtime; *period = now - p->last_task_numa_placement; + + /* Avoid time going backwards, prevent potential divide error: */ + if (unlikely((s64)*period < 0)) + *period = 0; } else { delta = p->se.avg.load_sum; *period = LOAD_AVG_MAX;
[tip:sched/urgent] sched/numa: Fix a possible divide-by-zero
Commit-ID: 993efef2483bde15fd33ec5fba5464597c2d8124 Gitweb: https://git.kernel.org/tip/993efef2483bde15fd33ec5fba5464597c2d8124 Author: Xie XiuQi AuthorDate: Sat, 20 Apr 2019 16:34:16 +0800 Committer: Ingo Molnar CommitDate: Thu, 25 Apr 2019 19:58:35 +0200 sched/numa: Fix a possible divide-by-zero sched_clock_cpu() may not be consistent between CPUs. If a task migrates to another CPU, then se.exec_start is set to that CPU's rq_clock_task() by update_stats_curr_start(). Specifically, the new value might be before the old value due to clock skew. So then if in numa_get_avg_runtime() the expression: 'now - p->last_task_numa_placement' ends up as -1, then the divider '*period + 1' in task_numa_placement() is 0 and things go bang. Similar to update_curr(), check if time goes backwards to avoid this. [ peterz: Wrote new changelog. ] [ mingo: Tweaked the code comment. ] Signed-off-by: Xie XiuQi Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: cj.chengj...@huawei.com Link: http://lkml.kernel.org/r/20190425080016.gx11...@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 1 file changed, 4 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a4d9e14bf138..35f3ea375084 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2007,6 +2007,10 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) if (p->last_task_numa_placement) { delta = runtime - p->last_sum_exec_runtime; *period = now - p->last_task_numa_placement; + + /* Avoid time going backwards, prevent potential divide error: */ + if (unlikely((s64)*period < 0)) + *period = 0; } else { delta = p->se.avg.load_sum; *period = LOAD_AVG_MAX;
Re: [PATCH] sched: fix a potential divide error
Hi Peter, On 2019/4/25 16:00, Peter Zijlstra wrote: > On Thu, Apr 25, 2019 at 11:52:28AM +0800, Xie XiuQi wrote: >> On 2019/4/24 2:44, Peter Zijlstra wrote: > >>> I'll try and come up with a better Changelog tomorrow. > > I actually did, but forgot to send out. I have the below. > Does that work for you? It works for me, thank you very much. > > --- > Subject: sched/numa: Fix a possible divide-by-zero > From: Xie XiuQi > Date: Sat, 20 Apr 2019 16:34:16 +0800 > > sched_clock_cpu() may not be consistent between CPUs. If a task > migrates to another CPU, then se.exec_start is set to that CPU's > rq_clock_task() by update_stats_curr_start(). Specifically, the new > value might be before the old value due to clock skew. > > So then if in numa_get_avg_runtime() the expression: > > 'now - p->last_task_numa_placement' > > ends up as -1, then the divider '*period + 1' in task_numa_placement() > is 0 and things go bang. Similar to update_curr(), check if time goes > backwards to avoid this. > > Cc: mi...@redhat.com > Cc: cj.chengj...@huawei.com > Signed-off-by: Xie XiuQi > [peterz: simplified changelog] > Signed-off-by: Peter Zijlstra (Intel) > Link: https://lkml.kernel.org/r/20190420083416.170446-1-xiexi...@huawei.com > --- > kernel/sched/fair.c |4 > 1 file changed, 4 insertions(+) > > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -2007,6 +2007,10 @@ static u64 numa_get_avg_runtime(struct t > if (p->last_task_numa_placement) { > delta = runtime - p->last_sum_exec_runtime; > *period = now - p->last_task_numa_placement; > + > + /* Avoid backward, and prevent potential divide error */ > + if (unlikely((s64)*period < 0)) > + *period = 0; > } else { > delta = p->se.avg.load_sum; > *period = LOAD_AVG_MAX; > > . > -- Thanks, Xie XiuQi
Re: [PATCH] sched: fix a potential divide error
Hi Peter, Thanks for your comments. On 2019/4/24 2:44, Peter Zijlstra wrote: > On Sat, Apr 20, 2019 at 04:34:16PM +0800, Xie XiuQi wrote: >> We meet a divide error on 3.10.0 kernel, the error message is bellow: > > That is a _realll_ old kernel. I would urge you to upgrade. I will. > >> [42.287996] divide error: [#1] SMP > >> sched_clock_cpu may not be consistent bwtwen cpus. If a task migrate >> to another cpu, the se.exec_start was set to that cpu's rq_clock_task >> by update_stats_curr_start(). Which may not be monotonic. >> >> update_stats_curr_start >> <- set_next_entity >> <- set_curr_task_fair >> <- sched_move_task > > That is not in fact a cross-cpu migration path. But I see the point. > Also many migration paths do in fact preserve monotonicity, even when > the clock is busted, but you're right, not all of them. > >> So, if now - p->last_task_numa_placement is -1, then (*period + 1) is >> 0, and divide error was triggerred at the div operation: >> task_numa_placement: >> runtime = numa_get_avg_runtime(p, ); >> f_weight = div64_u64(runtime << 16, period + 1); // divide error here >> >> In this patch, we make sure period is not less than 0 to avoid this >> divide error. >> >> Signed-off-by: Xie XiuQi >> Cc: sta...@vger.kernel.org >> --- >> kernel/sched/fair.c | 4 >> 1 file changed, 4 insertions(+) >> >> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >> index 40bd1e27b1b7..f2abb258fc85 100644 >> --- a/kernel/sched/fair.c >> +++ b/kernel/sched/fair.c >> @@ -2007,6 +2007,10 @@ static u64 numa_get_avg_runtime(struct task_struct >> *p, u64 *period) >> if (p->last_task_numa_placement) { >> delta = runtime - p->last_sum_exec_runtime; >> *period = now - p->last_task_numa_placement; >> + >> +/* Avoid backward, and prevent potential divide error */ >> +if ((s64)*period < 0) >> +*period = 0; >> } else { >> delta = p->se.avg.load_sum; >> *period = LOAD_AVG_MAX; > > Yeah, I suppose that is indeed correct. > > I'll try and come up with a better Changelog tomorrow. Thanks. -- Thanks, Xie XiuQi
[PATCH] sched: fix a potential divide error
We meet a divide error on 3.10.0 kernel, the error message is bellow: [42.287996] divide error: [#1] SMP [42.297796] do nothing after die! [42.299108] Modules linked in: signo_catch macvlan binfmt_misc ip_set_hash_netport ip_set_hash_ipport vport_vxlan ipt_REJECT xt_statistic xt_physdev xt_nat xt_recent xt_mark xt_comment ... [42.312751] CPU: 8 PID: 23352 Comm: bash Tainted: G V--- 3.10.0+ #1 [42.314308] Hardware name: OpenStack Foundation OpenStack Nova, BIOS rel-1.9.1-0-gb3ef39f-20170329_185309-build9a64a246a231 04/01/2014 [42.317411] task: 880033fc9700 ti: 8807fed6 task.ti:8807fed6 [42.318967] RIP: 0010:[] [] task_numa_fault+0x1c2/0xbb0 [42.320515] RSP: :8807fed63d38 EFLAGS: 00010246 [42.322018] RAX: 002b7efd RBX: 880033fc9700 RCX:0003 [42.323563] RDX: RSI: 0400 RDI:81a80f60 [42.325052] RBP: 8807fed63db8 R08: 81a80f68 R09: [42.326531] R10: 88083ffda000 R11: R12:0424 [42.327987] R13: 002b7efd R14: R15:ea001ea42a00 [42.329420] FS: 7fa01a3b7740() GS:88103ec0()knlGS: [42.330866] CS: 0010 DS: ES: CR0: 80050033 [42.332302] CR2: 00ff1fb0 CR3: 0007ff1d1000 CR4:003407e0 [42.333763] DR0: DR1: DR2: [42.335187] DR3: DR6: fffe0ff0 DR7:0400 [42.336595] Stack: [42.337974] 0001bc9598a8 ea001ea42a00 00010001 [42.339374] 00030001 0001 ea001ea42a00 8807fed63db8 [42.340768] bc9598a8 0001 [42.342148] Call Trace: [42.343494] [] do_numa_page+0x162/0x1f0 [42.344831] [] handle_mm_fault+0x627/0xf50 [42.346145] [] __do_page_fault+0x166/0x470 [42.347442] [] trace_do_page_fault+0x43/0x110 [42.348711] [] do_async_page_fault+0x29/0xe0 [42.349948] [] async_page_fault+0x28/0x30 [42.351149] Code: 00 3d 00 04 00 00 44 0f 4e d8 41 81 fb 00 04 00 00 0f 84 67 07 00 00 4c 89 e8 49 83 c6 01 31 d2 48 c1 e0 10 49 83 c4 01 45 31 c9 <49> f7 f6 48 c7 45 a8 00 00 00 00 48 c7 45 b0 00 00 00 00 49 89 [42.353707] RIP [] task_numa_fault+0x1c2/0xbb0 [42.354927] RSP [42.358114] ---[ end trace 4f2465cac18ff65e ]--- [42.359304] Kernel panic - not syncing: Fatal exception sched_clock_cpu may not be consistent bwtwen cpus. If a task migrate to another cpu, the se.exec_start was set to that cpu's rq_clock_task by update_stats_curr_start(). Which may not be monotonic. update_stats_curr_start <- set_next_entity <- set_curr_task_fair <- sched_move_task So, if now - p->last_task_numa_placement is -1, then (*period + 1) is 0, and divide error was triggerred at the div operation: task_numa_placement: runtime = numa_get_avg_runtime(p, ); f_weight = div64_u64(runtime << 16, period + 1); // divide error here In this patch, we make sure period is not less than 0 to avoid this divide error. Signed-off-by: Xie XiuQi Cc: sta...@vger.kernel.org --- kernel/sched/fair.c | 4 1 file changed, 4 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 40bd1e27b1b7..f2abb258fc85 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2007,6 +2007,10 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) if (p->last_task_numa_placement) { delta = runtime - p->last_sum_exec_runtime; *period = now - p->last_task_numa_placement; + + /* Avoid backward, and prevent potential divide error */ + if ((s64)*period < 0) + *period = 0; } else { delta = p->se.avg.load_sum; *period = LOAD_AVG_MAX; -- 2.20.1
Re: KASAN: use-after-free Write in _free_event
ash yet. > > Is there a chance of getting a reproducer for this one? > > Thanks, > -- > Alex > > . > -- Thanks, Xie XiuQi == BUG: KASAN: use-after-free in atomic_dec_and_test include/asm-generic/atomic-instrumented.h:259 [inline] BUG: KASAN: use-after-free in put_task_struct include/linux/sched/task.h:95 [inline] BUG: KASAN: use-after-free in _free_event+0x2c4/0xfe0 kernel/events/core.c:4452 Write of size 4 at addr 8883d8d80020 by task syz-executor0/28620 CPU: 1 PID: 28620 Comm: syz-executor0 Not tainted 4.19.18-514.55.6.9.x86_64+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0xca/0x13e lib/dump_stack.c:113 print_address_description+0x79/0x330 mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report+0x18a/0x2e0 mm/kasan/report.c:412 atomic_dec_and_test include/asm-generic/atomic-instrumented.h:259 [inline] put_task_struct include/linux/sched/task.h:95 [inline] _free_event+0x2c4/0xfe0 kernel/events/core.c:4452 free_event+0x41/0xa0 kernel/events/core.c:4473 perf_event_release_kernel+0x43b/0xc70 kernel/events/core.c:4634 perf_release+0x33/0x40 kernel/events/core.c:4648 __fput+0x27f/0x7f0 fs/file_table.c:278 task_work_run+0x136/0x1b0 kernel/task_work.c:113 tracehook_notify_resume include/linux/tracehook.h:193 [inline] exit_to_usermode_loop+0x1a7/0x1d0 arch/x86/entry/common.c:166 prepare_exit_to_usermode arch/x86/entry/common.c:197 [inline] syscall_return_slowpath arch/x86/entry/common.c:268 [inline] do_syscall_64+0x461/0x580 arch/x86/entry/common.c:293 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4115f7 Code: 00 00 0f 05 48 3d 00 f0 ff ff 77 3f f3 c3 0f 1f 44 00 00 53 89 fb 48 83 ec 10 e8 f4 fb ff ff 89 df 89 c2 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 2b 89 d7 89 44 24 0c e8 36 fc ff ff 8b 44 24 RSP: 002b:7ffe24d0ba80 EFLAGS: 0293 ORIG_RAX: 0003 RAX: RBX: 0005 RCX: 004115f7 RDX: RSI: 00730f30 RDI: 0005 RBP: 7ffe24d0baec R08: 7ffe24d0b9b0 R09: 7ffe24d0b9b0 R10: 7ffe24d0b9c0 R11: 0293 R12: 0001 R13: 0001 R14: R15: Allocated by task 28624: set_track mm/kasan/kasan.c:460 [inline] kasan_kmalloc+0xa0/0xd0 mm/kasan/kasan.c:553 slab_post_alloc_hook mm/slab.h:444 [inline] slab_alloc_node mm/slub.c:2706 [inline] kmem_cache_alloc_node+0xe3/0x2e0 mm/slub.c:2742 alloc_task_struct_node kernel/fork.c:157 [inline] dup_task_struct kernel/fork.c:802 [inline] copy_process.part.26+0x1a98/0x6c70 kernel/fork.c:1707 copy_process kernel/fork.c:1664 [inline] _do_fork+0x1c6/0xe60 kernel/fork.c:2175 do_syscall_64+0xc8/0x580 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 28624: set_track mm/kasan/kasan.c:460 [inline] __kasan_slab_free+0x12e/0x180 mm/kasan/kasan.c:521 slab_free_hook mm/slub.c:1371 [inline] slab_free_freelist_hook mm/slub.c:1398 [inline] slab_free mm/slub.c:2953 [inline] kmem_cache_free+0xbd/0x320 mm/slub.c:2969 copy_process.part.26+0x18d9/0x6c70 kernel/fork.c:2107 copy_process kernel/fork.c:1664 [inline] _do_fork+0x1c6/0xe60 kernel/fork.c:2175 do_syscall_64+0xc8/0x580 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe The buggy address belongs to the object at 8883d8d8 which belongs to the cache task_struct of size 9992 The buggy address is located 32 bytes inside of 9992-byte region [8883d8d8, 8883d8d82708) The buggy address belongs to the page: page:ea000f636000 count:1 mapcount:0 mapping:888107d07a00 index:0x0 compound_mapcount: 0 flags: 0x2f80008100(slab|head) raw: 002f80008100 00020001 888107d07a00 raw: 80030003 0001 page dumped because: kasan: bad access detected Memory state around the buggy address: 8883d8d7ff00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 8883d8d7ff80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >8883d8d8: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ 8883d8d80080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb 8883d8d80100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb BUG: KASAN: use-after-free in atomic_dec_and_test include/asm-generic/atomic-instrumented.h:259 [inline] BUG: KASAN: use-after-free in put_task_struct include/linux/sched/task.h:95 [inline] BUG: KASAN: use-after-free in _free_event+0x2c4/0xfe0 kernel/events/core.c:4452 Write of size 4 at addr 8883780cd0a0 by task syz-executor0/9553 CPU: 3 PID: 9553 Comm: syz-executor0 Not tai
Re: [PATCH] sched: fix infinity loop in update_blocked_averages
Hi Tejun, On 2018/12/28 10:02, Tejun Heo wrote: > On Thu, Dec 27, 2018 at 05:53:52PM -0800, Tejun Heo wrote: >> Vincent knows that part way better than me but I think the safest way >> would be doing the optimization removal iff tmp_alone_branch is >> already pointing to leaf_cfs_rq_list. IIUC, it's pointing to >> something else only while a branch is being built and deferring >> optimization removal by an avg update cycle isn't gonna make any >> difference anyway. > > So, something like the following. Xie, can you see whether the > following patch resolves the problem? Zhipeng is preparing to test it, thanks. > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > index d1907506318a..88b9118b5191 100644 > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -7698,7 +7698,8 @@ static void update_blocked_averages(int cpu) >* There can be a lot of idle CPU cgroups. Don't let fully >* decayed cfs_rqs linger on the list. >*/ > - if (cfs_rq_is_decayed(cfs_rq)) > + if (cfs_rq_is_decayed(cfs_rq) && > + rq->tmp_alone_branch == >leaf_cfs_rq_list) > list_del_leaf_cfs_rq(cfs_rq); > > /* Don't need periodic decay once load/util_avg are null */ > > . > -- Thanks, Xie XiuQi
[PATCH] sched: fix infinity loop in update_blocked_averages
Zhepeng Xie report a bug, there is a infinity loop in update_blocked_averages(). PID: 14233 TASK: 800b2de08fc0 CPU: 1 COMMAND: "docker" #0 [2213b9d0] update_blocked_averages at 0811e4a8 #1 [2213ba60] pick_next_task_fair at 0812a3b4 #2 [2213baf0] __schedule at 08deaa88 #3 [2213bb70] schedule at 08deb1b8 #4 [2213bb80] futex_wait_queue_me at 08180754 #5 [2213bbd0] futex_wait at 0818192c #6 [2213bd00] do_futex at 08183ee4 #7 [2213bde0] __arm64_sys_futex at 08184398 #8 [2213be60] el0_svc_common at 080979ac #9 [2213bea0] el0_svc_handler at 08097a6c #10 [2213bff0] el0_svc at 08084044 rq->tmp_alone_branch introduced in 4.10, used to point to the new beg of the list. If this cfs_rq is deleted somewhere else, then the tmp_alone_branch will be illegal and cause a list_add corruption. (When enabled DEBUG_LIST, we fould this list_add corruption) [ 2546.741103] list_add corruption. next->prev should be prev (800b4d61ad40), but was 800ba434fa38. (next=800b6a95e740). [ 2546.741130] [ cut here ] [ 2546.741132] kernel BUG at lib/list_debug.c:25! [ 2546.741136] Internal error: Oops - BUG: 0 [#1] SMP [ 2546.742870] CPU: 1 PID: 29428 Comm: docker-runc Kdump: loaded Tainted: G E 4.19.5-1.aarch64 #1 [ 2546.745415] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 [ 2546.747402] pstate: 4085 (nZcv daIf -PAN -UAO) [ 2546.749015] pc : __list_add_valid+0x50/0x90 [ 2546.750485] lr : __list_add_valid+0x50/0x90 [ 2546.751975] sp : 1b5eb910 [ 2546.753286] x29: 1b5eb910 x28: 800abacf [ 2546.754976] x27: 1b5ebbb0 x26: 0957 [ 2546.756665] x25: 0960d000 x24: 0250f41ca8f8 [ 2546.758366] x23: 800b6a95e740 x22: 800b4d61ad40 [ 2546.760066] x21: 800b4d61ad40 x20: 800ba434f080 [ 2546.761742] x19: 800b4d61ac00 x18: [ 2546.763425] x17: x16: [ 2546.765089] x15: 09570748 x14: 662073617720 [ 2546.766755] x13: 747562202c293034 x12: 6461313664346230 [ 2546.768429] x11: 30382820 x10: [ 2546.770124] x9 : 0001 x8 : 09f34a0f [ 2546.771831] x7 : x6 : 250d [ 2546.773525] x5 : x4 : [ 2546.775227] x3 : x2 : 70ef7f624013ca00 [ 2546.776929] x1 : x0 : 0075 [ 2546.778623] Process docker-runc (pid: 29428, stack limit = 0x293494a2) [ 2546.780742] Call trace: [ 2546.781955] __list_add_valid+0x50/0x90 [ 2546.783469] enqueue_entity+0x4a0/0x6e8 [ 2546.784957] enqueue_task_fair+0xac/0x610 [ 2546.786502] sched_move_task+0x134/0x178 [ 2546.787993] cpu_cgroup_attach+0x40/0x78 [ 2546.789540] cgroup_migrate_execute+0x378/0x3a8 [ 2546.791169] cgroup_migrate+0x6c/0x90 [ 2546.792663] cgroup_attach_task+0x148/0x238 [ 2546.794211] __cgroup1_procs_write.isra.2+0xf8/0x160 [ 2546.795935] cgroup1_procs_write+0x38/0x48 [ 2546.797492] cgroup_file_write+0xa0/0x170 [ 2546.799010] kernfs_fop_write+0x114/0x1e0 [ 2546.800558] __vfs_write+0x60/0x190 [ 2546.801977] vfs_write+0xac/0x1c0 [ 2546.803341] ksys_write+0x6c/0xd8 [ 2546.804674] __arm64_sys_write+0x24/0x30 [ 2546.806146] el0_svc_common+0x78/0x100 [ 2546.807584] el0_svc_handler+0x38/0x88 [ 2546.809017] el0_svc+0x8/0xc In this patch, we move rq->tmp_alone_branch point to its prev before delete it from list. Reported-by: Zhipeng Xie Cc: Bin Li Cc: [4.10+] Fixes: 9c2791f936ef (sched/fair: Fix hierarchical order in rq->leaf_cfs_rq_list) Signed-off-by: Xie XiuQi Tested-by: Zhipeng Xie --- kernel/sched/fair.c | 5 + 1 file changed, 5 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ac855b2..7a72702 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -347,6 +347,11 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (cfs_rq->on_list) { + struct rq *rq = rq_of(cfs_rq); + + if (rq->tmp_alone_branch == _rq->leaf_cfs_rq_list) + rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev; + list_del_rcu(_rq->leaf_cfs_rq_list); cfs_rq->on_list = 0; } -- 1.8.3.1
Re: [PATCH 1/2] arm64: avoid alloc memory on offline node
Hi Lorenzo, Punit, On 2018/6/20 0:32, Lorenzo Pieralisi wrote: > On Tue, Jun 19, 2018 at 04:35:40PM +0100, Punit Agrawal wrote: >> Michal Hocko writes: >> >>> On Tue 19-06-18 15:54:26, Punit Agrawal wrote: >>> [...] >>>> In terms of $SUBJECT, I wonder if it's worth taking the original patch >>>> as a temporary fix (it'll also be easier to backport) while we work on >>>> fixing these other issues and enabling memoryless nodes. >>> >>> Well, x86 already does that but copying this antipatern is not really >>> nice. So it is good as a quick fix but it would be definitely much >>> better to have a robust fix. Who knows how many other places might hit >>> this. You certainly do not want to add a hack like this all over... >> >> Completely agree! I was only suggesting it as a temporary measure, >> especially as it looked like a proper fix might be invasive. >> >> Another fix might be to change the node specific allocation to node >> agnostic allocations. It isn't clear why the allocation is being >> requested from a specific node. I think Lorenzo suggested this in one of >> the threads. > > I think that code was just copypasted but it is better to fix the > underlying issue. > >> I've started putting together a set fixing the issues identified in this >> thread. It should give a better idea on the best course of action. > > On ACPI ARM64, this diff should do if I read the code correctly, it > should be (famous last words) just a matter of mapping PXMs to nodes for > every SRAT GICC entry, feel free to pick it up if it works. > > Yes, we can take the original patch just because it is safer for an -rc > cycle even though if the patch below would do delaying the fix for a > couple of -rc (to get it tested across ACPI ARM64 NUMA platforms) is > not a disaster. I tested this patch on my arm board, it works. -- Thanks, Xie XiuQi > > Lorenzo > > -- >8 -- > diff --git a/arch/arm64/kernel/acpi_numa.c b/arch/arm64/kernel/acpi_numa.c > index d190a7b231bf..877b268ef9fa 100644 > --- a/arch/arm64/kernel/acpi_numa.c > +++ b/arch/arm64/kernel/acpi_numa.c > @@ -70,12 +70,6 @@ void __init acpi_numa_gicc_affinity_init(struct > acpi_srat_gicc_affinity *pa) > if (!(pa->flags & ACPI_SRAT_GICC_ENABLED)) > return; > > - if (cpus_in_srat >= NR_CPUS) { > - pr_warn_once("SRAT: cpu_to_node_map[%d] is too small, may not > be able to use all cpus\n", > - NR_CPUS); > - return; > - } > - > pxm = pa->proximity_domain; > node = acpi_map_pxm_to_node(pxm); > > @@ -85,6 +79,14 @@ void __init acpi_numa_gicc_affinity_init(struct > acpi_srat_gicc_affinity *pa) > return; > } > > + node_set(node, numa_nodes_parsed); > + > + if (cpus_in_srat >= NR_CPUS) { > + pr_warn_once("SRAT: cpu_to_node_map[%d] is too small, may not > be able to use all cpus\n", > + NR_CPUS); > + return; > + } > + > mpidr = acpi_map_madt_entry(pa->acpi_processor_uid); > if (mpidr == PHYS_CPUID_INVALID) { > pr_err("SRAT: PXM %d with ACPI ID %d has no valid MPIDR in > MADT\n", > @@ -95,7 +97,6 @@ void __init acpi_numa_gicc_affinity_init(struct > acpi_srat_gicc_affinity *pa) > > early_node_cpu_hwid[cpus_in_srat].node_id = node; > early_node_cpu_hwid[cpus_in_srat].cpu_hwid = mpidr; > - node_set(node, numa_nodes_parsed); > cpus_in_srat++; > pr_info("SRAT: PXM %d -> MPIDR 0x%Lx -> Node %d\n", > pxm, mpidr, node); > > . >
Re: [PATCH 1/2] arm64: avoid alloc memory on offline node
Hi Lorenzo, Punit, On 2018/6/20 0:32, Lorenzo Pieralisi wrote: > On Tue, Jun 19, 2018 at 04:35:40PM +0100, Punit Agrawal wrote: >> Michal Hocko writes: >> >>> On Tue 19-06-18 15:54:26, Punit Agrawal wrote: >>> [...] >>>> In terms of $SUBJECT, I wonder if it's worth taking the original patch >>>> as a temporary fix (it'll also be easier to backport) while we work on >>>> fixing these other issues and enabling memoryless nodes. >>> >>> Well, x86 already does that but copying this antipatern is not really >>> nice. So it is good as a quick fix but it would be definitely much >>> better to have a robust fix. Who knows how many other places might hit >>> this. You certainly do not want to add a hack like this all over... >> >> Completely agree! I was only suggesting it as a temporary measure, >> especially as it looked like a proper fix might be invasive. >> >> Another fix might be to change the node specific allocation to node >> agnostic allocations. It isn't clear why the allocation is being >> requested from a specific node. I think Lorenzo suggested this in one of >> the threads. > > I think that code was just copypasted but it is better to fix the > underlying issue. > >> I've started putting together a set fixing the issues identified in this >> thread. It should give a better idea on the best course of action. > > On ACPI ARM64, this diff should do if I read the code correctly, it > should be (famous last words) just a matter of mapping PXMs to nodes for > every SRAT GICC entry, feel free to pick it up if it works. > > Yes, we can take the original patch just because it is safer for an -rc > cycle even though if the patch below would do delaying the fix for a > couple of -rc (to get it tested across ACPI ARM64 NUMA platforms) is > not a disaster. I tested this patch on my arm board, it works. -- Thanks, Xie XiuQi > > Lorenzo > > -- >8 -- > diff --git a/arch/arm64/kernel/acpi_numa.c b/arch/arm64/kernel/acpi_numa.c > index d190a7b231bf..877b268ef9fa 100644 > --- a/arch/arm64/kernel/acpi_numa.c > +++ b/arch/arm64/kernel/acpi_numa.c > @@ -70,12 +70,6 @@ void __init acpi_numa_gicc_affinity_init(struct > acpi_srat_gicc_affinity *pa) > if (!(pa->flags & ACPI_SRAT_GICC_ENABLED)) > return; > > - if (cpus_in_srat >= NR_CPUS) { > - pr_warn_once("SRAT: cpu_to_node_map[%d] is too small, may not > be able to use all cpus\n", > - NR_CPUS); > - return; > - } > - > pxm = pa->proximity_domain; > node = acpi_map_pxm_to_node(pxm); > > @@ -85,6 +79,14 @@ void __init acpi_numa_gicc_affinity_init(struct > acpi_srat_gicc_affinity *pa) > return; > } > > + node_set(node, numa_nodes_parsed); > + > + if (cpus_in_srat >= NR_CPUS) { > + pr_warn_once("SRAT: cpu_to_node_map[%d] is too small, may not > be able to use all cpus\n", > + NR_CPUS); > + return; > + } > + > mpidr = acpi_map_madt_entry(pa->acpi_processor_uid); > if (mpidr == PHYS_CPUID_INVALID) { > pr_err("SRAT: PXM %d with ACPI ID %d has no valid MPIDR in > MADT\n", > @@ -95,7 +97,6 @@ void __init acpi_numa_gicc_affinity_init(struct > acpi_srat_gicc_affinity *pa) > > early_node_cpu_hwid[cpus_in_srat].node_id = node; > early_node_cpu_hwid[cpus_in_srat].cpu_hwid = mpidr; > - node_set(node, numa_nodes_parsed); > cpus_in_srat++; > pr_info("SRAT: PXM %d -> MPIDR 0x%Lx -> Node %d\n", > pxm, mpidr, node); > > . >
Re: [PATCH 1/2] arm64: avoid alloc memory on offline node
Hi Michal, On 2018/6/19 20:07, Michal Hocko wrote: > On Tue 19-06-18 20:03:07, Xie XiuQi wrote: > [...] >> I tested on a arm board with 128 cores 4 numa nodes, but I set >> CONFIG_NR_CPUS=72. >> Then node 3 is not be created, because node 3 has no memory, and no cpu. >> But some pci device may related to node 3, which be set in ACPI table. > > Could you double check that zonelists for node 3 are generated > correctly? > zonelists for node 3 is not created at all. Kernel parse SRAT table to create node info, but in this case, SRAT table is parsed not completed. Only the first 72 items are parsed. In SRAT table, we haven't seen node 3 information yet, because cpu_to_node_map[72] is too small. [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x3 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30001 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30002 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30003 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30100 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30101 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30102 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30103 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30200 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30201 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30202 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30203 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30300 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30301 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30302 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30303 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30400 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30401 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30402 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30403 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30500 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30501 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30502 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30503 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30600 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30601 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30602 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30603 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30700 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30701 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30702 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30703 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x1 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10001 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10002 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10003 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10100 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10101 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10102 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10103 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10200 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10201 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10202 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10203 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10300 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10301 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10302 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10303 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10400 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10401 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10402 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10403 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10500 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10501 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10502 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10503 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10600 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10601 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10602 -> Node 1
Re: [PATCH 1/2] arm64: avoid alloc memory on offline node
Hi Michal, On 2018/6/19 20:07, Michal Hocko wrote: > On Tue 19-06-18 20:03:07, Xie XiuQi wrote: > [...] >> I tested on a arm board with 128 cores 4 numa nodes, but I set >> CONFIG_NR_CPUS=72. >> Then node 3 is not be created, because node 3 has no memory, and no cpu. >> But some pci device may related to node 3, which be set in ACPI table. > > Could you double check that zonelists for node 3 are generated > correctly? > zonelists for node 3 is not created at all. Kernel parse SRAT table to create node info, but in this case, SRAT table is parsed not completed. Only the first 72 items are parsed. In SRAT table, we haven't seen node 3 information yet, because cpu_to_node_map[72] is too small. [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x3 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30001 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30002 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30003 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30100 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30101 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30102 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30103 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30200 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30201 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30202 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30203 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30300 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30301 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30302 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30303 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30400 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30401 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30402 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30403 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30500 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30501 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30502 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30503 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30600 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30601 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30602 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30603 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30700 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30701 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30702 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30703 -> Node 0 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x1 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10001 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10002 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10003 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10100 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10101 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10102 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10103 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10200 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10201 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10202 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10203 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10300 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10301 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10302 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10303 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10400 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10401 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10402 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10403 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10500 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10501 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10502 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10503 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10600 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10601 -> Node 1 [0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10602 -> Node 1
Re: [PATCH 1/2] arm64: avoid alloc memory on offline node
Hi Punit, On 2018/6/14 1:39, Punit Agrawal wrote: > Punit Agrawal writes: > > > [...] > >> >> CONFIG_HAVE_MEMORYLESS node is not enabled on arm64 which means we end >> up returning the original node in the fallback path. >> >> Xie, does the below patch help? I can submit a proper patch if this >> fixes the issue for you. >> >> -- >8 -- >> Subject: [PATCH] arm64/numa: Enable memoryless numa nodes >> >> Signed-off-by: Punit Agrawal >> --- >> arch/arm64/Kconfig | 4 >> arch/arm64/mm/numa.c | 2 ++ >> 2 files changed, 6 insertions(+) >> >> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig >> index eb2cf4938f6d..5317e9aa93ab 100644 >> --- a/arch/arm64/Kconfig >> +++ b/arch/arm64/Kconfig >> @@ -756,6 +756,10 @@ config USE_PERCPU_NUMA_NODE_ID >> def_bool y >> depends on NUMA >> >> +config HAVE_MEMORYLESS_NODES >> + def_bool y >> + depends on NUMA >> + >> config HAVE_SETUP_PER_CPU_AREA >> def_bool y >> depends on NUMA >> diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c >> index dad128ba98bf..c699dcfe93de 100644 >> --- a/arch/arm64/mm/numa.c >> +++ b/arch/arm64/mm/numa.c >> @@ -73,6 +73,8 @@ EXPORT_SYMBOL(cpumask_of_node); >> static void map_cpu_to_node(unsigned int cpu, int nid) >> { >> set_cpu_numa_node(cpu, nid); >> +set_numa_mem(local_memory_node(nid)); > > Argh, this should be > > set_cpu_numa_mem(cpu, local_memory_node(nid)); > > There is not guarantee that map_cpu_to_node() will be called on the > local cpu. > > Hanjun, Xie - can you try with the update please? I've tested this patch, but it does not help. The boot message is attached. I tested on a arm board with 128 cores 4 numa nodes, but I set CONFIG_NR_CPUS=72. Then node 3 is not be created, because node 3 has no memory, and no cpu. But some pci device may related to node 3, which be set in ACPI table. 165 /* Interface called from ACPI code to setup PCI host controller */ 166 struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) 167 { 168 int node = acpi_get_node(root->device->handle); 169 struct acpi_pci_generic_root_info *ri; 170 struct pci_bus *bus, *child; 171 struct acpi_pci_root_ops *root_ops; 172 // this node may be not created. 177 ri = kzalloc_node(sizeof(*ri), GFP_KERNEL, node); 178 if (!ri) 179 return NULL; 180 181 root_ops = kzalloc_node(sizeof(*root_ops), GFP_KERNEL, node); 182 if (!root_ops) { 183 kfree(ri); 184 return NULL; 185 } 186 187 ri->cfg = pci_acpi_setup_ecam_mapping(root); 188 if (!ri->cfg) { 189 kfree(ri); 190 kfree(root_ops); 191 return NULL; 192 } > > Thanks, > Punit > >> + >> if (nid >= 0) >> cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); >> } > > . > -- Thanks, Xie XiuQi [0.00] Booting Linux on physical CPU 0x03 [0x480fd010] [0.00] Linux version 4.16.0-rc1-00491-g204a6cc-dirty (xiexiuqi@localhost.localdomain) (gcc version 6.3.1 20170404 (Linaro GCC 6.3-2017.05)) #17 SMP PREEMPT Tue Jun 19 16:33:32 CST 2018 [0.00] earlycon: pl11 at MMIO32 0x9408 (options '') [0.00] bootconsole [pl11] enabled [0.00] efi: Getting EFI parameters from FDT: [0.00] efi: EFI v2.60 by EDK II [0.00] efi: SMBIOS 3.0=0x3eb6 ACPI 2.0=0x3971 MEMATTR=0x3b106418 [0.00] ACPI: Early table checksum verification disabled [0.00] ACPI: RSDP 0x3971 24 (v02 HISI ) [0.00] ACPI: XSDT 0x3970 74 (v01 HISI HIP08 0113) [0.00] ACPI: FACP 0x3963 000114 (v06 HISI HIP08 INTL 20151124) [0.00] ACPI: DSDT 0x395C 006A1A (v02 HISI HIP08 INTL 20170929) [0.00] ACPI: GTDT 0x3962 60 (v02 HISI HIP08 INTL 20151124) [0.00] ACPI: DBG2 0x3961 5A (v00 HISI HIP08 INTL 20151124) [0.00] ACPI: MCFG 0x3960 3C (v01 HISI HIP08 INTL 20151124) [0.00] ACPI: SLIT 0x395F 3C (v01 HISI HIP07 INTL 20151124) [0.00] ACPI: SRAT 0x395E 0009C0 (v03 HISI HIP08 INTL 20151124) [0.00] ACPI: APIC 0x395D 00286C (v04 HISI HIP08 INTL 20151124) [0.00] ACPI: IORT 0x395B 00110C (v00 HISI HIP08 INTL 20170929) [0.00] ACPI: PPTT 0x311F 003
Re: [PATCH 1/2] arm64: avoid alloc memory on offline node
Hi Punit, On 2018/6/14 1:39, Punit Agrawal wrote: > Punit Agrawal writes: > > > [...] > >> >> CONFIG_HAVE_MEMORYLESS node is not enabled on arm64 which means we end >> up returning the original node in the fallback path. >> >> Xie, does the below patch help? I can submit a proper patch if this >> fixes the issue for you. >> >> -- >8 -- >> Subject: [PATCH] arm64/numa: Enable memoryless numa nodes >> >> Signed-off-by: Punit Agrawal >> --- >> arch/arm64/Kconfig | 4 >> arch/arm64/mm/numa.c | 2 ++ >> 2 files changed, 6 insertions(+) >> >> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig >> index eb2cf4938f6d..5317e9aa93ab 100644 >> --- a/arch/arm64/Kconfig >> +++ b/arch/arm64/Kconfig >> @@ -756,6 +756,10 @@ config USE_PERCPU_NUMA_NODE_ID >> def_bool y >> depends on NUMA >> >> +config HAVE_MEMORYLESS_NODES >> + def_bool y >> + depends on NUMA >> + >> config HAVE_SETUP_PER_CPU_AREA >> def_bool y >> depends on NUMA >> diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c >> index dad128ba98bf..c699dcfe93de 100644 >> --- a/arch/arm64/mm/numa.c >> +++ b/arch/arm64/mm/numa.c >> @@ -73,6 +73,8 @@ EXPORT_SYMBOL(cpumask_of_node); >> static void map_cpu_to_node(unsigned int cpu, int nid) >> { >> set_cpu_numa_node(cpu, nid); >> +set_numa_mem(local_memory_node(nid)); > > Argh, this should be > > set_cpu_numa_mem(cpu, local_memory_node(nid)); > > There is not guarantee that map_cpu_to_node() will be called on the > local cpu. > > Hanjun, Xie - can you try with the update please? I've tested this patch, but it does not help. The boot message is attached. I tested on a arm board with 128 cores 4 numa nodes, but I set CONFIG_NR_CPUS=72. Then node 3 is not be created, because node 3 has no memory, and no cpu. But some pci device may related to node 3, which be set in ACPI table. 165 /* Interface called from ACPI code to setup PCI host controller */ 166 struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) 167 { 168 int node = acpi_get_node(root->device->handle); 169 struct acpi_pci_generic_root_info *ri; 170 struct pci_bus *bus, *child; 171 struct acpi_pci_root_ops *root_ops; 172 // this node may be not created. 177 ri = kzalloc_node(sizeof(*ri), GFP_KERNEL, node); 178 if (!ri) 179 return NULL; 180 181 root_ops = kzalloc_node(sizeof(*root_ops), GFP_KERNEL, node); 182 if (!root_ops) { 183 kfree(ri); 184 return NULL; 185 } 186 187 ri->cfg = pci_acpi_setup_ecam_mapping(root); 188 if (!ri->cfg) { 189 kfree(ri); 190 kfree(root_ops); 191 return NULL; 192 } > > Thanks, > Punit > >> + >> if (nid >= 0) >> cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); >> } > > . > -- Thanks, Xie XiuQi [0.00] Booting Linux on physical CPU 0x03 [0x480fd010] [0.00] Linux version 4.16.0-rc1-00491-g204a6cc-dirty (xiexiuqi@localhost.localdomain) (gcc version 6.3.1 20170404 (Linaro GCC 6.3-2017.05)) #17 SMP PREEMPT Tue Jun 19 16:33:32 CST 2018 [0.00] earlycon: pl11 at MMIO32 0x9408 (options '') [0.00] bootconsole [pl11] enabled [0.00] efi: Getting EFI parameters from FDT: [0.00] efi: EFI v2.60 by EDK II [0.00] efi: SMBIOS 3.0=0x3eb6 ACPI 2.0=0x3971 MEMATTR=0x3b106418 [0.00] ACPI: Early table checksum verification disabled [0.00] ACPI: RSDP 0x3971 24 (v02 HISI ) [0.00] ACPI: XSDT 0x3970 74 (v01 HISI HIP08 0113) [0.00] ACPI: FACP 0x3963 000114 (v06 HISI HIP08 INTL 20151124) [0.00] ACPI: DSDT 0x395C 006A1A (v02 HISI HIP08 INTL 20170929) [0.00] ACPI: GTDT 0x3962 60 (v02 HISI HIP08 INTL 20151124) [0.00] ACPI: DBG2 0x3961 5A (v00 HISI HIP08 INTL 20151124) [0.00] ACPI: MCFG 0x3960 3C (v01 HISI HIP08 INTL 20151124) [0.00] ACPI: SLIT 0x395F 3C (v01 HISI HIP07 INTL 20151124) [0.00] ACPI: SRAT 0x395E 0009C0 (v03 HISI HIP08 INTL 20151124) [0.00] ACPI: APIC 0x395D 00286C (v04 HISI HIP08 INTL 20151124) [0.00] ACPI: IORT 0x395B 00110C (v00 HISI HIP08 INTL 20170929) [0.00] ACPI: PPTT 0x311F 003
Re: [PATCH 1/2] arm64: avoid alloc memory on offline node
Hi Michal, On 2018/6/11 16:52, Michal Hocko wrote: > On Mon 11-06-18 11:23:18, Xie XiuQi wrote: >> Hi Michal, >> >> On 2018/6/7 20:21, Michal Hocko wrote: >>> On Thu 07-06-18 19:55:53, Hanjun Guo wrote: >>>> On 2018/6/7 18:55, Michal Hocko wrote: >>> [...] >>>>> I am not sure I have the full context but pci_acpi_scan_root calls >>>>> kzalloc_node(sizeof(*info), GFP_KERNEL, node) >>>>> and that should fall back to whatever node that is online. Offline node >>>>> shouldn't keep any pages behind. So there must be something else going >>>>> on here and the patch is not the right way to handle it. What does >>>>> faddr2line __alloc_pages_nodemask+0xf0 tells on this kernel? >>>> >>>> The whole context is: >>>> >>>> The system is booted with a NUMA node has no memory attaching to it >>>> (memory-less NUMA node), also with NR_CPUS less than CPUs presented >>>> in MADT, so CPUs on this memory-less node are not brought up, and >>>> this NUMA node will not be online (but SRAT presents this NUMA node); >>>> >>>> Devices attaching to this NUMA node such as PCI host bridge still >>>> return the valid NUMA node via _PXM, but actually that valid NUMA node >>>> is not online which lead to this issue. >>> >>> But we should have other numa nodes on the zonelists so the allocator >>> should fall back to other node. If the zonelist is not intiailized >>> properly, though, then this can indeed show up as a problem. Knowing >>> which exact place has blown up would help get a better picture... >>> >> >> I specific a non-exist node to allocate memory using kzalloc_node, >> and got this following error message. >> >> And I found out there is just a VM_WARN, but it does not prevent the memory >> allocation continue. >> >> This nid would be use to access NODE_DADA(nid), so if nid is invalid, >> it would cause oops here. >> >> 459 /* >> 460 * Allocate pages, preferring the node given as nid. The node must be >> valid and >> 461 * online. For more general interface, see alloc_pages_node(). >> 462 */ >> 463 static inline struct page * >> 464 __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) >> 465 { >> 466 VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); >> 467 VM_WARN_ON(!node_online(nid)); >> 468 >> 469 return __alloc_pages(gfp_mask, order, nid); >> 470 } >> 471 >> >> (I wrote a ko, to allocate memory on a non-exist node using kzalloc_node().) > > OK, so this is an artificialy broken code, right. You shouldn't get a > non-existent node via standard APIs AFAICS. The original report was > about an existing node which is offline AFAIU. That would be a different > case. If I am missing something and there are legitimate users that try > to allocate from non-existing nodes then we should handle that in > node_zonelist. I think hanjun's comments may help to understood this question: - NUMA node will be built if CPUs and (or) memory are valid on this NUMA node; - But if we boot the system with memory-less node and also with CONFIG_NR_CPUS less than CPUs in SRAT, for example, 64 CPUs total with 4 NUMA nodes, 16 CPUs on each NUMA node, if we boot with CONFIG_NR_CPUS=48, then we will not built numa node for node 3, but with devices on that numa node, alloc memory will be panic because NUMA node 3 is not a valid node. I triggered this BUG on arm64 platform, and I found a similar bug has been fixed on x86 platform. So I sent a similar patch for this bug. Or, could we consider to fix it in the mm subsystem? >From b755de8dfdfef97effaa91379ffafcb81f4d62a1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 20 Feb 2008 12:41:52 -0800 Subject: [PATCH] x86: make dev_to_node return online node a numa system (with multi HT chains) may return node without ram. Aka it is not online. Try to get an online node, otherwise return -1. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/pci/acpi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index d95de2f..ea8685f 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -172,6 +172,9 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do set_mp_bus_to_node(busnum, node); else node = get_mp_bus_to_node(busnum); + + if (node != -1 && !node_online(node)) + node = -1; #endif /* Allocate per-root-bus (not per bus) arch-specific data. -- 1.8.3.1 > > [...] > -- Thanks, Xie XiuQi
Re: [PATCH 1/2] arm64: avoid alloc memory on offline node
Hi Michal, On 2018/6/11 16:52, Michal Hocko wrote: > On Mon 11-06-18 11:23:18, Xie XiuQi wrote: >> Hi Michal, >> >> On 2018/6/7 20:21, Michal Hocko wrote: >>> On Thu 07-06-18 19:55:53, Hanjun Guo wrote: >>>> On 2018/6/7 18:55, Michal Hocko wrote: >>> [...] >>>>> I am not sure I have the full context but pci_acpi_scan_root calls >>>>> kzalloc_node(sizeof(*info), GFP_KERNEL, node) >>>>> and that should fall back to whatever node that is online. Offline node >>>>> shouldn't keep any pages behind. So there must be something else going >>>>> on here and the patch is not the right way to handle it. What does >>>>> faddr2line __alloc_pages_nodemask+0xf0 tells on this kernel? >>>> >>>> The whole context is: >>>> >>>> The system is booted with a NUMA node has no memory attaching to it >>>> (memory-less NUMA node), also with NR_CPUS less than CPUs presented >>>> in MADT, so CPUs on this memory-less node are not brought up, and >>>> this NUMA node will not be online (but SRAT presents this NUMA node); >>>> >>>> Devices attaching to this NUMA node such as PCI host bridge still >>>> return the valid NUMA node via _PXM, but actually that valid NUMA node >>>> is not online which lead to this issue. >>> >>> But we should have other numa nodes on the zonelists so the allocator >>> should fall back to other node. If the zonelist is not intiailized >>> properly, though, then this can indeed show up as a problem. Knowing >>> which exact place has blown up would help get a better picture... >>> >> >> I specific a non-exist node to allocate memory using kzalloc_node, >> and got this following error message. >> >> And I found out there is just a VM_WARN, but it does not prevent the memory >> allocation continue. >> >> This nid would be use to access NODE_DADA(nid), so if nid is invalid, >> it would cause oops here. >> >> 459 /* >> 460 * Allocate pages, preferring the node given as nid. The node must be >> valid and >> 461 * online. For more general interface, see alloc_pages_node(). >> 462 */ >> 463 static inline struct page * >> 464 __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) >> 465 { >> 466 VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); >> 467 VM_WARN_ON(!node_online(nid)); >> 468 >> 469 return __alloc_pages(gfp_mask, order, nid); >> 470 } >> 471 >> >> (I wrote a ko, to allocate memory on a non-exist node using kzalloc_node().) > > OK, so this is an artificialy broken code, right. You shouldn't get a > non-existent node via standard APIs AFAICS. The original report was > about an existing node which is offline AFAIU. That would be a different > case. If I am missing something and there are legitimate users that try > to allocate from non-existing nodes then we should handle that in > node_zonelist. I think hanjun's comments may help to understood this question: - NUMA node will be built if CPUs and (or) memory are valid on this NUMA node; - But if we boot the system with memory-less node and also with CONFIG_NR_CPUS less than CPUs in SRAT, for example, 64 CPUs total with 4 NUMA nodes, 16 CPUs on each NUMA node, if we boot with CONFIG_NR_CPUS=48, then we will not built numa node for node 3, but with devices on that numa node, alloc memory will be panic because NUMA node 3 is not a valid node. I triggered this BUG on arm64 platform, and I found a similar bug has been fixed on x86 platform. So I sent a similar patch for this bug. Or, could we consider to fix it in the mm subsystem? >From b755de8dfdfef97effaa91379ffafcb81f4d62a1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 20 Feb 2008 12:41:52 -0800 Subject: [PATCH] x86: make dev_to_node return online node a numa system (with multi HT chains) may return node without ram. Aka it is not online. Try to get an online node, otherwise return -1. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/pci/acpi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index d95de2f..ea8685f 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -172,6 +172,9 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do set_mp_bus_to_node(busnum, node); else node = get_mp_bus_to_node(busnum); + + if (node != -1 && !node_online(node)) + node = -1; #endif /* Allocate per-root-bus (not per bus) arch-specific data. -- 1.8.3.1 > > [...] > -- Thanks, Xie XiuQi
Re: [PATCH 1/2] arm64: avoid alloc memory on offline node
: 0041a2d8 R08: R09: 7ffd7d73f8b8 [ 120.737019] R10: 0003 R11: 0206 R12: [ 120.744123] R13: 00c20130 R14: R15: [ 120.751230] Code: 89 c6 74 0d e8 55 ab 5e 00 8b 74 24 1c 48 8b 3c 24 48 8b 54 24 08 89 d9 c1 e9 17 83 e1 01 48 85 d2 88 4c 24 20 0f 85 25 01 00 00 <3b> 77 08 0f 82 1c 01 00 00 48 89 f8 44 89 ea 48 89 e1 44 89 e6 [ 120.770020] RIP: __alloc_pages_nodemask+0x10d/0x2c0 RSP: 881196947a90 [ 120.776780] CR2: 2088 [ 120.780116] ---[ end trace 89f801c36550734f ]--- [ 120.978922] Kernel panic - not syncing: Fatal exception [ 120.984186] Kernel Offset: 0x3380 from 0xffff8100 (relocation range: 0x8000-0xbfff) [ 121.209501] ---[ end Kernel panic - not syncing: Fatal exception ]--- -- Thanks, Xie XiuQi
Re: [PATCH 1/2] arm64: avoid alloc memory on offline node
: 0041a2d8 R08: R09: 7ffd7d73f8b8 [ 120.737019] R10: 0003 R11: 0206 R12: [ 120.744123] R13: 00c20130 R14: R15: [ 120.751230] Code: 89 c6 74 0d e8 55 ab 5e 00 8b 74 24 1c 48 8b 3c 24 48 8b 54 24 08 89 d9 c1 e9 17 83 e1 01 48 85 d2 88 4c 24 20 0f 85 25 01 00 00 <3b> 77 08 0f 82 1c 01 00 00 48 89 f8 44 89 ea 48 89 e1 44 89 e6 [ 120.770020] RIP: __alloc_pages_nodemask+0x10d/0x2c0 RSP: 881196947a90 [ 120.776780] CR2: 2088 [ 120.780116] ---[ end trace 89f801c36550734f ]--- [ 120.978922] Kernel panic - not syncing: Fatal exception [ 120.984186] Kernel Offset: 0x3380 from 0xffff8100 (relocation range: 0x8000-0xbfff) [ 121.209501] ---[ end Kernel panic - not syncing: Fatal exception ]--- -- Thanks, Xie XiuQi
[PATCH 2/2] drivers: check numa node's online status in dev_to_node
If dev->numa_node is not available (or offline), we should return NUMA_NO_NODE to prevent alloc memory on offline nodes, which could cause oops. For example, a numa node: 1) without memory 2) NR_CPUS is very small, and the cpus on the node are not brought up [ 27.851041] Unable to handle kernel NULL pointer dereference at virtual address 1988 [ 27.859128] Mem abort info: [ 27.861908] ESR = 0x9605 [ 27.864949] Exception class = DABT (current EL), IL = 32 bits [ 27.870860] SET = 0, FnV = 0 [ 27.873900] EA = 0, S1PTW = 0 [ 27.877029] Data abort info: [ 27.879895] ISV = 0, ISS = 0x0005 [ 27.883716] CM = 0, WnR = 0 [ 27.886673] [1988] user address but active_mm is swapper [ 27.893012] Internal error: Oops: 9605 [#1] SMP [ 27.897876] Modules linked in: [ 27.900919] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 4.17.0-rc6-mpam+ #116 [ 27.907865] Hardware name: Huawei D06/D06, BIOS Hisilicon D06 EC UEFI Nemo 2.0 RC0 - B306 05/28/2018 [ 27.916983] pstate: 80c9 (Nzcv daif +PAN +UAO) [ 27.921763] pc : __alloc_pages_nodemask+0xf0/0xe70 [ 27.926540] lr : __alloc_pages_nodemask+0x184/0xe70 [ 27.931403] sp : 0996f7e0 [ 27.934704] x29: 0996f7e0 x28: 08cb10a0 [ 27.940003] x27: 014012c0 x26: [ 27.945301] x25: 0003 x24: 085bbc14 [ 27.950600] x23: 0040 x22: [ 27.955898] x21: 0001 x20: [ 27.961196] x19: 0040 x18: 0f00 [ 27.966494] x17: 003bff88 x16: 0020 [ 27.971792] x15: 003b x14: [ 27.977090] x13: x12: 0030 [ 27.982388] x11: 0101010101010101 x10: 7f7f7f7f7f7f7f7f [ 27.987686] x9 : 2e64716e622e7364 x8 : 7f7f7f7f7f7f7f7f [ 27.992984] x7 : x6 : 08d73c08 [ 27.998282] x5 : x4 : 0081 [ 28.003580] x3 : x2 : [ 28.008878] x1 : 0001 x0 : 1980 [ 28.014177] Process swapper/0 (pid: 1, stack limit = 0x(ptrval)) [ 28.020863] Call trace: [ 28.023296] __alloc_pages_nodemask+0xf0/0xe70 [ 28.027727] allocate_slab+0x94/0x590 [ 28.031374] new_slab+0x68/0xc8 [ 28.034502] ___slab_alloc+0x444/0x4f8 [ 28.038237] __slab_alloc+0x50/0x68 [ 28.041713] __kmalloc_node_track_caller+0x100/0x320 [ 28.046664] devm_kmalloc+0x3c/0x90 [ 28.050139] pinctrl_bind_pins+0x4c/0x298 [ 28.054135] driver_probe_device+0xb4/0x4a0 [ 28.058305] __driver_attach+0x124/0x128 [ 28.062213] bus_for_each_dev+0x78/0xe0 [ 28.066035] driver_attach+0x30/0x40 [ 28.069597] bus_add_driver+0x248/0x2b8 [ 28.073419] driver_register+0x68/0x100 [ 28.077242] __pci_register_driver+0x64/0x78 [ 28.081500] pcie_portdrv_init+0x44/0x4c [ 28.085410] do_one_initcall+0x54/0x208 [ 28.089232] kernel_init_freeable+0x244/0x340 [ 28.093577] kernel_init+0x18/0x118 [ 28.097052] ret_from_fork+0x10/0x1c [ 28.100614] Code: 7100047f 321902a4 1a950095 b5000602 (b9400803) [ 28.106740] ---[ end trace e32df44e6e1c3a4b ]--- Signed-off-by: Xie XiuQi Tested-by: Huiqiang Wang Cc: Hanjun Guo Cc: Tomasz Nowicki Cc: Xishi Qiu --- include/linux/device.h | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/device.h b/include/linux/device.h index 4779569..2a4fb08 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1017,7 +1017,12 @@ extern __printf(2, 3) #ifdef CONFIG_NUMA static inline int dev_to_node(struct device *dev) { - return dev->numa_node; + int node = dev->numa_node; + + if (unlikely(node != NUMA_NO_NODE && !node_online(node))) + return NUMA_NO_NODE; + + return node; } static inline void set_dev_node(struct device *dev, int node) { -- 1.8.3.1
[PATCH 0/2] arm64/drivers: avoid alloc memory on offline node
A numa system may return node which is not online. For example, a numa node: 1) without memory 2) NR_CPUS is very small, and the cpus on the node are not brought up In this situation, we use NUMA_NO_NODE to avoid oops. [ 25.732905] Unable to handle kernel NULL pointer dereference at virtual address 1988 [ 25.740982] Mem abort info: [ 25.743762] ESR = 0x9605 [ 25.746803] Exception class = DABT (current EL), IL = 32 bits [ 25.752711] SET = 0, FnV = 0 [ 25.755751] EA = 0, S1PTW = 0 [ 25.758878] Data abort info: [ 25.761745] ISV = 0, ISS = 0x0005 [ 25.765568] CM = 0, WnR = 0 [ 25.768521] [1988] user address but active_mm is swapper [ 25.774861] Internal error: Oops: 9605 [#1] SMP [ 25.779724] Modules linked in: [ 25.782768] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 4.17.0-rc6-mpam+ #115 [ 25.789714] Hardware name: Huawei D06/D06, BIOS Hisilicon D06 EC UEFI Nemo 2.0 RC0 - B305 05/28/2018 [ 25.798831] pstate: 80c9 (Nzcv daif +PAN +UAO) [ 25.803612] pc : __alloc_pages_nodemask+0xf0/0xe70 [ 25.808389] lr : __alloc_pages_nodemask+0x184/0xe70 [ 25.813252] sp : 0996f660 [ 25.816553] x29: 0996f660 x28: [ 25.821852] x27: 014012c0 x26: [ 25.827150] x25: 0003 x24: 08099eac [ 25.832449] x23: 0040 x22: [ 25.837747] x21: 0001 x20: [ 25.843045] x19: 0040 x18: 00010e00 [ 25.848343] x17: 0437f790 x16: 0020 [ 25.853641] x15: x14: 6549435020524541 [ 25.858939] x13: 20454d502067756c x12: [ 25.864237] x11: 0996f6f0 x10: 0006 [ 25.869536] x9 : 12a4 x8 : 8023c000ff90 [ 25.874834] x7 : x6 : 08d73c08 [ 25.880132] x5 : x4 : 0081 [ 25.885430] x3 : x2 : [ 25.890728] x1 : 0001 x0 : 1980 [ 25.896027] Process swapper/0 (pid: 1, stack limit = 0x(ptrval)) [ 25.902712] Call trace: [ 25.905146] __alloc_pages_nodemask+0xf0/0xe70 [ 25.909577] allocate_slab+0x94/0x590 [ 25.913225] new_slab+0x68/0xc8 [ 25.916353] ___slab_alloc+0x444/0x4f8 [ 25.920088] __slab_alloc+0x50/0x68 [ 25.923562] kmem_cache_alloc_node_trace+0xe8/0x230 [ 25.928426] pci_acpi_scan_root+0x94/0x278 [ 25.932510] acpi_pci_root_add+0x228/0x4b0 [ 25.936593] acpi_bus_attach+0x10c/0x218 [ 25.940501] acpi_bus_attach+0xac/0x218 [ 25.944323] acpi_bus_attach+0xac/0x218 [ 25.948144] acpi_bus_scan+0x5c/0xc0 [ 25.951708] acpi_scan_init+0xf8/0x254 [ 25.955443] acpi_init+0x310/0x37c [ 25.958831] do_one_initcall+0x54/0x208 [ 25.962653] kernel_init_freeable+0x244/0x340 [ 25.966999] kernel_init+0x18/0x118 [ 25.970474] ret_from_fork+0x10/0x1c [ 25.974036] Code: 7100047f 321902a4 1a950095 b5000602 (b9400803) [ 25.980162] ---[ end trace 64f0893eb21ec283 ]--- [ 25.984765] Kernel panic - not syncing: Fatal exception Xie XiuQi (2): arm64: avoid alloc memory on offline node drivers: check numa node's online status in dev_to_node arch/arm64/kernel/pci.c | 3 +++ include/linux/device.h | 7 ++- 2 files changed, 9 insertions(+), 1 deletion(-) -- 1.8.3.1
[PATCH 0/2] arm64/drivers: avoid alloc memory on offline node
A numa system may return node which is not online. For example, a numa node: 1) without memory 2) NR_CPUS is very small, and the cpus on the node are not brought up In this situation, we use NUMA_NO_NODE to avoid oops. [ 25.732905] Unable to handle kernel NULL pointer dereference at virtual address 1988 [ 25.740982] Mem abort info: [ 25.743762] ESR = 0x9605 [ 25.746803] Exception class = DABT (current EL), IL = 32 bits [ 25.752711] SET = 0, FnV = 0 [ 25.755751] EA = 0, S1PTW = 0 [ 25.758878] Data abort info: [ 25.761745] ISV = 0, ISS = 0x0005 [ 25.765568] CM = 0, WnR = 0 [ 25.768521] [1988] user address but active_mm is swapper [ 25.774861] Internal error: Oops: 9605 [#1] SMP [ 25.779724] Modules linked in: [ 25.782768] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 4.17.0-rc6-mpam+ #115 [ 25.789714] Hardware name: Huawei D06/D06, BIOS Hisilicon D06 EC UEFI Nemo 2.0 RC0 - B305 05/28/2018 [ 25.798831] pstate: 80c9 (Nzcv daif +PAN +UAO) [ 25.803612] pc : __alloc_pages_nodemask+0xf0/0xe70 [ 25.808389] lr : __alloc_pages_nodemask+0x184/0xe70 [ 25.813252] sp : 0996f660 [ 25.816553] x29: 0996f660 x28: [ 25.821852] x27: 014012c0 x26: [ 25.827150] x25: 0003 x24: 08099eac [ 25.832449] x23: 0040 x22: [ 25.837747] x21: 0001 x20: [ 25.843045] x19: 0040 x18: 00010e00 [ 25.848343] x17: 0437f790 x16: 0020 [ 25.853641] x15: x14: 6549435020524541 [ 25.858939] x13: 20454d502067756c x12: [ 25.864237] x11: 0996f6f0 x10: 0006 [ 25.869536] x9 : 12a4 x8 : 8023c000ff90 [ 25.874834] x7 : x6 : 08d73c08 [ 25.880132] x5 : x4 : 0081 [ 25.885430] x3 : x2 : [ 25.890728] x1 : 0001 x0 : 1980 [ 25.896027] Process swapper/0 (pid: 1, stack limit = 0x(ptrval)) [ 25.902712] Call trace: [ 25.905146] __alloc_pages_nodemask+0xf0/0xe70 [ 25.909577] allocate_slab+0x94/0x590 [ 25.913225] new_slab+0x68/0xc8 [ 25.916353] ___slab_alloc+0x444/0x4f8 [ 25.920088] __slab_alloc+0x50/0x68 [ 25.923562] kmem_cache_alloc_node_trace+0xe8/0x230 [ 25.928426] pci_acpi_scan_root+0x94/0x278 [ 25.932510] acpi_pci_root_add+0x228/0x4b0 [ 25.936593] acpi_bus_attach+0x10c/0x218 [ 25.940501] acpi_bus_attach+0xac/0x218 [ 25.944323] acpi_bus_attach+0xac/0x218 [ 25.948144] acpi_bus_scan+0x5c/0xc0 [ 25.951708] acpi_scan_init+0xf8/0x254 [ 25.955443] acpi_init+0x310/0x37c [ 25.958831] do_one_initcall+0x54/0x208 [ 25.962653] kernel_init_freeable+0x244/0x340 [ 25.966999] kernel_init+0x18/0x118 [ 25.970474] ret_from_fork+0x10/0x1c [ 25.974036] Code: 7100047f 321902a4 1a950095 b5000602 (b9400803) [ 25.980162] ---[ end trace 64f0893eb21ec283 ]--- [ 25.984765] Kernel panic - not syncing: Fatal exception Xie XiuQi (2): arm64: avoid alloc memory on offline node drivers: check numa node's online status in dev_to_node arch/arm64/kernel/pci.c | 3 +++ include/linux/device.h | 7 ++- 2 files changed, 9 insertions(+), 1 deletion(-) -- 1.8.3.1
[PATCH 2/2] drivers: check numa node's online status in dev_to_node
If dev->numa_node is not available (or offline), we should return NUMA_NO_NODE to prevent alloc memory on offline nodes, which could cause oops. For example, a numa node: 1) without memory 2) NR_CPUS is very small, and the cpus on the node are not brought up [ 27.851041] Unable to handle kernel NULL pointer dereference at virtual address 1988 [ 27.859128] Mem abort info: [ 27.861908] ESR = 0x9605 [ 27.864949] Exception class = DABT (current EL), IL = 32 bits [ 27.870860] SET = 0, FnV = 0 [ 27.873900] EA = 0, S1PTW = 0 [ 27.877029] Data abort info: [ 27.879895] ISV = 0, ISS = 0x0005 [ 27.883716] CM = 0, WnR = 0 [ 27.886673] [1988] user address but active_mm is swapper [ 27.893012] Internal error: Oops: 9605 [#1] SMP [ 27.897876] Modules linked in: [ 27.900919] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 4.17.0-rc6-mpam+ #116 [ 27.907865] Hardware name: Huawei D06/D06, BIOS Hisilicon D06 EC UEFI Nemo 2.0 RC0 - B306 05/28/2018 [ 27.916983] pstate: 80c9 (Nzcv daif +PAN +UAO) [ 27.921763] pc : __alloc_pages_nodemask+0xf0/0xe70 [ 27.926540] lr : __alloc_pages_nodemask+0x184/0xe70 [ 27.931403] sp : 0996f7e0 [ 27.934704] x29: 0996f7e0 x28: 08cb10a0 [ 27.940003] x27: 014012c0 x26: [ 27.945301] x25: 0003 x24: 085bbc14 [ 27.950600] x23: 0040 x22: [ 27.955898] x21: 0001 x20: [ 27.961196] x19: 0040 x18: 0f00 [ 27.966494] x17: 003bff88 x16: 0020 [ 27.971792] x15: 003b x14: [ 27.977090] x13: x12: 0030 [ 27.982388] x11: 0101010101010101 x10: 7f7f7f7f7f7f7f7f [ 27.987686] x9 : 2e64716e622e7364 x8 : 7f7f7f7f7f7f7f7f [ 27.992984] x7 : x6 : 08d73c08 [ 27.998282] x5 : x4 : 0081 [ 28.003580] x3 : x2 : [ 28.008878] x1 : 0001 x0 : 1980 [ 28.014177] Process swapper/0 (pid: 1, stack limit = 0x(ptrval)) [ 28.020863] Call trace: [ 28.023296] __alloc_pages_nodemask+0xf0/0xe70 [ 28.027727] allocate_slab+0x94/0x590 [ 28.031374] new_slab+0x68/0xc8 [ 28.034502] ___slab_alloc+0x444/0x4f8 [ 28.038237] __slab_alloc+0x50/0x68 [ 28.041713] __kmalloc_node_track_caller+0x100/0x320 [ 28.046664] devm_kmalloc+0x3c/0x90 [ 28.050139] pinctrl_bind_pins+0x4c/0x298 [ 28.054135] driver_probe_device+0xb4/0x4a0 [ 28.058305] __driver_attach+0x124/0x128 [ 28.062213] bus_for_each_dev+0x78/0xe0 [ 28.066035] driver_attach+0x30/0x40 [ 28.069597] bus_add_driver+0x248/0x2b8 [ 28.073419] driver_register+0x68/0x100 [ 28.077242] __pci_register_driver+0x64/0x78 [ 28.081500] pcie_portdrv_init+0x44/0x4c [ 28.085410] do_one_initcall+0x54/0x208 [ 28.089232] kernel_init_freeable+0x244/0x340 [ 28.093577] kernel_init+0x18/0x118 [ 28.097052] ret_from_fork+0x10/0x1c [ 28.100614] Code: 7100047f 321902a4 1a950095 b5000602 (b9400803) [ 28.106740] ---[ end trace e32df44e6e1c3a4b ]--- Signed-off-by: Xie XiuQi Tested-by: Huiqiang Wang Cc: Hanjun Guo Cc: Tomasz Nowicki Cc: Xishi Qiu --- include/linux/device.h | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/device.h b/include/linux/device.h index 4779569..2a4fb08 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1017,7 +1017,12 @@ extern __printf(2, 3) #ifdef CONFIG_NUMA static inline int dev_to_node(struct device *dev) { - return dev->numa_node; + int node = dev->numa_node; + + if (unlikely(node != NUMA_NO_NODE && !node_online(node))) + return NUMA_NO_NODE; + + return node; } static inline void set_dev_node(struct device *dev, int node) { -- 1.8.3.1
[PATCH 1/2] arm64: avoid alloc memory on offline node
A numa system may return node which is not online. For example, a numa node: 1) without memory 2) NR_CPUS is very small, and the cpus on the node are not brought up In this situation, we use NUMA_NO_NODE to avoid oops. [ 25.732905] Unable to handle kernel NULL pointer dereference at virtual address 1988 [ 25.740982] Mem abort info: [ 25.743762] ESR = 0x9605 [ 25.746803] Exception class = DABT (current EL), IL = 32 bits [ 25.752711] SET = 0, FnV = 0 [ 25.755751] EA = 0, S1PTW = 0 [ 25.758878] Data abort info: [ 25.761745] ISV = 0, ISS = 0x0005 [ 25.765568] CM = 0, WnR = 0 [ 25.768521] [1988] user address but active_mm is swapper [ 25.774861] Internal error: Oops: 9605 [#1] SMP [ 25.779724] Modules linked in: [ 25.782768] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 4.17.0-rc6-mpam+ #115 [ 25.789714] Hardware name: Huawei D06/D06, BIOS Hisilicon D06 EC UEFI Nemo 2.0 RC0 - B305 05/28/2018 [ 25.798831] pstate: 80c9 (Nzcv daif +PAN +UAO) [ 25.803612] pc : __alloc_pages_nodemask+0xf0/0xe70 [ 25.808389] lr : __alloc_pages_nodemask+0x184/0xe70 [ 25.813252] sp : 0996f660 [ 25.816553] x29: 0996f660 x28: [ 25.821852] x27: 014012c0 x26: [ 25.827150] x25: 0003 x24: 08099eac [ 25.832449] x23: 0040 x22: [ 25.837747] x21: 0001 x20: [ 25.843045] x19: 0040 x18: 00010e00 [ 25.848343] x17: 0437f790 x16: 0020 [ 25.853641] x15: x14: 6549435020524541 [ 25.858939] x13: 20454d502067756c x12: [ 25.864237] x11: 0996f6f0 x10: 0006 [ 25.869536] x9 : 12a4 x8 : 8023c000ff90 [ 25.874834] x7 : x6 : 08d73c08 [ 25.880132] x5 : x4 : 0081 [ 25.885430] x3 : x2 : [ 25.890728] x1 : 0001 x0 : 1980 [ 25.896027] Process swapper/0 (pid: 1, stack limit = 0x(ptrval)) [ 25.902712] Call trace: [ 25.905146] __alloc_pages_nodemask+0xf0/0xe70 [ 25.909577] allocate_slab+0x94/0x590 [ 25.913225] new_slab+0x68/0xc8 [ 25.916353] ___slab_alloc+0x444/0x4f8 [ 25.920088] __slab_alloc+0x50/0x68 [ 25.923562] kmem_cache_alloc_node_trace+0xe8/0x230 [ 25.928426] pci_acpi_scan_root+0x94/0x278 [ 25.932510] acpi_pci_root_add+0x228/0x4b0 [ 25.936593] acpi_bus_attach+0x10c/0x218 [ 25.940501] acpi_bus_attach+0xac/0x218 [ 25.944323] acpi_bus_attach+0xac/0x218 [ 25.948144] acpi_bus_scan+0x5c/0xc0 [ 25.951708] acpi_scan_init+0xf8/0x254 [ 25.955443] acpi_init+0x310/0x37c [ 25.958831] do_one_initcall+0x54/0x208 [ 25.962653] kernel_init_freeable+0x244/0x340 [ 25.966999] kernel_init+0x18/0x118 [ 25.970474] ret_from_fork+0x10/0x1c [ 25.974036] Code: 7100047f 321902a4 1a950095 b5000602 (b9400803) [ 25.980162] ---[ end trace 64f0893eb21ec283 ]--- [ 25.984765] Kernel panic - not syncing: Fatal exception Signed-off-by: Xie XiuQi Tested-by: Huiqiang Wang Cc: Hanjun Guo Cc: Tomasz Nowicki Cc: Xishi Qiu --- arch/arm64/kernel/pci.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c index 0e2ea1c..e17cc45 100644 --- a/arch/arm64/kernel/pci.c +++ b/arch/arm64/kernel/pci.c @@ -170,6 +170,9 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) struct pci_bus *bus, *child; struct acpi_pci_root_ops *root_ops; + if (node != NUMA_NO_NODE && !node_online(node)) + node = NUMA_NO_NODE; + ri = kzalloc_node(sizeof(*ri), GFP_KERNEL, node); if (!ri) return NULL; -- 1.8.3.1
[PATCH 1/2] arm64: avoid alloc memory on offline node
A numa system may return node which is not online. For example, a numa node: 1) without memory 2) NR_CPUS is very small, and the cpus on the node are not brought up In this situation, we use NUMA_NO_NODE to avoid oops. [ 25.732905] Unable to handle kernel NULL pointer dereference at virtual address 1988 [ 25.740982] Mem abort info: [ 25.743762] ESR = 0x9605 [ 25.746803] Exception class = DABT (current EL), IL = 32 bits [ 25.752711] SET = 0, FnV = 0 [ 25.755751] EA = 0, S1PTW = 0 [ 25.758878] Data abort info: [ 25.761745] ISV = 0, ISS = 0x0005 [ 25.765568] CM = 0, WnR = 0 [ 25.768521] [1988] user address but active_mm is swapper [ 25.774861] Internal error: Oops: 9605 [#1] SMP [ 25.779724] Modules linked in: [ 25.782768] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 4.17.0-rc6-mpam+ #115 [ 25.789714] Hardware name: Huawei D06/D06, BIOS Hisilicon D06 EC UEFI Nemo 2.0 RC0 - B305 05/28/2018 [ 25.798831] pstate: 80c9 (Nzcv daif +PAN +UAO) [ 25.803612] pc : __alloc_pages_nodemask+0xf0/0xe70 [ 25.808389] lr : __alloc_pages_nodemask+0x184/0xe70 [ 25.813252] sp : 0996f660 [ 25.816553] x29: 0996f660 x28: [ 25.821852] x27: 014012c0 x26: [ 25.827150] x25: 0003 x24: 08099eac [ 25.832449] x23: 0040 x22: [ 25.837747] x21: 0001 x20: [ 25.843045] x19: 0040 x18: 00010e00 [ 25.848343] x17: 0437f790 x16: 0020 [ 25.853641] x15: x14: 6549435020524541 [ 25.858939] x13: 20454d502067756c x12: [ 25.864237] x11: 0996f6f0 x10: 0006 [ 25.869536] x9 : 12a4 x8 : 8023c000ff90 [ 25.874834] x7 : x6 : 08d73c08 [ 25.880132] x5 : x4 : 0081 [ 25.885430] x3 : x2 : [ 25.890728] x1 : 0001 x0 : 1980 [ 25.896027] Process swapper/0 (pid: 1, stack limit = 0x(ptrval)) [ 25.902712] Call trace: [ 25.905146] __alloc_pages_nodemask+0xf0/0xe70 [ 25.909577] allocate_slab+0x94/0x590 [ 25.913225] new_slab+0x68/0xc8 [ 25.916353] ___slab_alloc+0x444/0x4f8 [ 25.920088] __slab_alloc+0x50/0x68 [ 25.923562] kmem_cache_alloc_node_trace+0xe8/0x230 [ 25.928426] pci_acpi_scan_root+0x94/0x278 [ 25.932510] acpi_pci_root_add+0x228/0x4b0 [ 25.936593] acpi_bus_attach+0x10c/0x218 [ 25.940501] acpi_bus_attach+0xac/0x218 [ 25.944323] acpi_bus_attach+0xac/0x218 [ 25.948144] acpi_bus_scan+0x5c/0xc0 [ 25.951708] acpi_scan_init+0xf8/0x254 [ 25.955443] acpi_init+0x310/0x37c [ 25.958831] do_one_initcall+0x54/0x208 [ 25.962653] kernel_init_freeable+0x244/0x340 [ 25.966999] kernel_init+0x18/0x118 [ 25.970474] ret_from_fork+0x10/0x1c [ 25.974036] Code: 7100047f 321902a4 1a950095 b5000602 (b9400803) [ 25.980162] ---[ end trace 64f0893eb21ec283 ]--- [ 25.984765] Kernel panic - not syncing: Fatal exception Signed-off-by: Xie XiuQi Tested-by: Huiqiang Wang Cc: Hanjun Guo Cc: Tomasz Nowicki Cc: Xishi Qiu --- arch/arm64/kernel/pci.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c index 0e2ea1c..e17cc45 100644 --- a/arch/arm64/kernel/pci.c +++ b/arch/arm64/kernel/pci.c @@ -170,6 +170,9 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) struct pci_bus *bus, *child; struct acpi_pci_root_ops *root_ops; + if (node != NUMA_NO_NODE && !node_online(node)) + node = NUMA_NO_NODE; + ri = kzalloc_node(sizeof(*ri), GFP_KERNEL, node); if (!ri) return NULL; -- 1.8.3.1
Re: [PATCH v5 2/3] GHES: add a notify chain for process memory section
Hi Boris, Thanks for your comments. On 2018/2/7 18:31, Borislav Petkov wrote: > On Fri, Jan 26, 2018 at 08:31:24PM +0800, Xie XiuQi wrote: >> Add a notify chain for process memory section, with >> which other modules might do error recovery. >> >> Signed-off-by: Xie XiuQi <xiexi...@huawei.com> >> Tested-by: Wang Xiongfeng <wangxiongfe...@huawei.com> >> Tested-by: Tyler Baicar <tbai...@codeaurora.org> >> --- >> drivers/acpi/apei/ghes.c | 10 ++ >> include/acpi/ghes.h | 8 >> 2 files changed, 18 insertions(+) >> >> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c >> index cff671d..1f0ebfb 100644 >> --- a/drivers/acpi/apei/ghes.c >> +++ b/drivers/acpi/apei/ghes.c >> @@ -109,6 +109,9 @@ static inline bool is_hest_type_generic_v2(struct ghes >> *ghes) >> static LIST_HEAD(ghes_hed); >> static DEFINE_MUTEX(ghes_list_mutex); >> >> +ATOMIC_NOTIFIER_HEAD(ghes_mem_err_chain); >> +EXPORT_SYMBOL(ghes_mem_err_chain); > > Definitely not EXPORT_SYMBOL. > > And certainly not export the notifier head. Have register and unregister > functions instead. > > That is, *if* you can't solve it differently with James. Notifiers > should be avoided if possible. > Yes, I saw the patchset, I agree. I will work with James to solve the problem. -- Thanks, Xie XiuQi
Re: [PATCH v5 2/3] GHES: add a notify chain for process memory section
Hi Boris, Thanks for your comments. On 2018/2/7 18:31, Borislav Petkov wrote: > On Fri, Jan 26, 2018 at 08:31:24PM +0800, Xie XiuQi wrote: >> Add a notify chain for process memory section, with >> which other modules might do error recovery. >> >> Signed-off-by: Xie XiuQi >> Tested-by: Wang Xiongfeng >> Tested-by: Tyler Baicar >> --- >> drivers/acpi/apei/ghes.c | 10 ++ >> include/acpi/ghes.h | 8 >> 2 files changed, 18 insertions(+) >> >> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c >> index cff671d..1f0ebfb 100644 >> --- a/drivers/acpi/apei/ghes.c >> +++ b/drivers/acpi/apei/ghes.c >> @@ -109,6 +109,9 @@ static inline bool is_hest_type_generic_v2(struct ghes >> *ghes) >> static LIST_HEAD(ghes_hed); >> static DEFINE_MUTEX(ghes_list_mutex); >> >> +ATOMIC_NOTIFIER_HEAD(ghes_mem_err_chain); >> +EXPORT_SYMBOL(ghes_mem_err_chain); > > Definitely not EXPORT_SYMBOL. > > And certainly not export the notifier head. Have register and unregister > functions instead. > > That is, *if* you can't solve it differently with James. Notifiers > should be avoided if possible. > Yes, I saw the patchset, I agree. I will work with James to solve the problem. -- Thanks, Xie XiuQi
Re: [PATCH v5 1/3] arm64/ras: support sea error recovery
Hi James, Sorry for reply late. On 2018/2/8 3:03, James Morse wrote: > Hi Xie XiuQi, > > On 30/01/18 19:19, James Morse wrote: >> On 26/01/18 12:31, Xie XiuQi wrote: >>> With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors >>> are consumed. According to the existing process, errors occurred in the >>> kernel, leading to direct panic, if it occurred the user-space, we should >>> just kill process. >>> >>> But there is a class of error, in fact, is not necessary to kill >>> process, you can recover and continue to run the process. Such as >>> the instruction data corrupted, where the memory page might be >>> read-only, which is has not been modified, the disk might have the >>> correct data, so you can directly drop the page, ant reload it when >>> necessary. >> >> With firmware-first support, we do all this... >> >> >>> So this patchset is just try to solve such problem: if the error is >>> consumed in user-space and the error occurs on a clean page, you can >>> directly drop the memory page without killing process. >>> >>> If the corrupted page is clean, just dropped it and return to user-space >>> without side effects. And if corrupted page is dirty, memory_failure() >>> will send SIGBUS with code=BUS_MCEERR_AR. While without this patchset, >>> do_sea() will just send SIGBUS, so the process was killed in the same place. >> >> ... but this happens too. I agree its something we should fix, but I don't >> think >> this is the best way to do it. >> >> This series is pulling the memory-failure-queue details back into the >> arch-code >> to build a second list, that gets processed as extra work when we return to >> user-space. >> >> >> The root of the issue is ghes_notify_sea() claims the notification as >> something >> APEI has dealt with, ... but it hasn't done it yet. The signals will be >> generated by something currently stuck in a queue. (Evidently x86 doesn't >> handle >> synchronous errors like this using firmware-first). >> >> I think a smaller fix is to give the queues that may be holding the >> memory_failure() work a kick as part of the code that calls >> ghes_notify_sea(). >> This means that by the time we return to do_sea() ghes_notify_sea()'s claim >> that >> APEI has dealt with it is true as any generated signals are pending. We can >> then >> skip the existing SIGBUS generation code. >> >> >>> Because memory_failure() may sleep, we can not call it directly in SEA >> >> (this one is more serious, I've attempted to fix it by moving all NMI-like >> GHES-notifications to use the estatus queue). >> >> >>> exception context. So we saved faulting physical address associated with >>> a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return >>> from SEA exception context and get into do_notify_resume() before the >>> process running, we could check it and call memory_failure() to do >>> recovery. >> >>> It's safe, because we are in process context. >> >> I think this is the trick. When we take a Synchronous-external-abort out of >> userspace, we're in process context too. We can add helpers to drain the >> memory_failure_queue which can be called when do_sea() when we know we're >> preemptible and interrupts-et-al are unmasked. > > Something like... base on [0], in arch/arm64/kernel/acpi.c: I am very glad that you are trying to solve the problem, which is very helpful. I agree with your proposal, and I'll test it on by box latter. Indeed, we're in precess context when we are in sea handler. I was thought we can't call schedule() in the exception handler before. Thank you very much! > -%<- > int apei_claim_sea(struct pt_regs *regs) > { > int cpu; > int err = -ENOENT; > unsigned long current_flags = arch_local_save_flags(); > unsigned long interrupted_flags = current_flags; > > if (!IS_ENABLED(CONFIG_ACPI_APEI_SEA)) > return err; > > if (regs) > interrupted_flags = regs->pstate; > > /* > * APEI expects an NMI-like notification to always be called > * in NMI context. > */ > local_daif_restore(DAIF_ERRCTX); > nmi_enter(); > err = ghes_notify_sea(); > cpu = smp_processor_id(); > nmi_exit(); > > /* > * APEI NMI-like notifications are deferred to irq_work. Unless
Re: [PATCH v5 1/3] arm64/ras: support sea error recovery
Hi James, Sorry for reply late. On 2018/2/8 3:03, James Morse wrote: > Hi Xie XiuQi, > > On 30/01/18 19:19, James Morse wrote: >> On 26/01/18 12:31, Xie XiuQi wrote: >>> With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors >>> are consumed. According to the existing process, errors occurred in the >>> kernel, leading to direct panic, if it occurred the user-space, we should >>> just kill process. >>> >>> But there is a class of error, in fact, is not necessary to kill >>> process, you can recover and continue to run the process. Such as >>> the instruction data corrupted, where the memory page might be >>> read-only, which is has not been modified, the disk might have the >>> correct data, so you can directly drop the page, ant reload it when >>> necessary. >> >> With firmware-first support, we do all this... >> >> >>> So this patchset is just try to solve such problem: if the error is >>> consumed in user-space and the error occurs on a clean page, you can >>> directly drop the memory page without killing process. >>> >>> If the corrupted page is clean, just dropped it and return to user-space >>> without side effects. And if corrupted page is dirty, memory_failure() >>> will send SIGBUS with code=BUS_MCEERR_AR. While without this patchset, >>> do_sea() will just send SIGBUS, so the process was killed in the same place. >> >> ... but this happens too. I agree its something we should fix, but I don't >> think >> this is the best way to do it. >> >> This series is pulling the memory-failure-queue details back into the >> arch-code >> to build a second list, that gets processed as extra work when we return to >> user-space. >> >> >> The root of the issue is ghes_notify_sea() claims the notification as >> something >> APEI has dealt with, ... but it hasn't done it yet. The signals will be >> generated by something currently stuck in a queue. (Evidently x86 doesn't >> handle >> synchronous errors like this using firmware-first). >> >> I think a smaller fix is to give the queues that may be holding the >> memory_failure() work a kick as part of the code that calls >> ghes_notify_sea(). >> This means that by the time we return to do_sea() ghes_notify_sea()'s claim >> that >> APEI has dealt with it is true as any generated signals are pending. We can >> then >> skip the existing SIGBUS generation code. >> >> >>> Because memory_failure() may sleep, we can not call it directly in SEA >> >> (this one is more serious, I've attempted to fix it by moving all NMI-like >> GHES-notifications to use the estatus queue). >> >> >>> exception context. So we saved faulting physical address associated with >>> a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return >>> from SEA exception context and get into do_notify_resume() before the >>> process running, we could check it and call memory_failure() to do >>> recovery. >> >>> It's safe, because we are in process context. >> >> I think this is the trick. When we take a Synchronous-external-abort out of >> userspace, we're in process context too. We can add helpers to drain the >> memory_failure_queue which can be called when do_sea() when we know we're >> preemptible and interrupts-et-al are unmasked. > > Something like... base on [0], in arch/arm64/kernel/acpi.c: I am very glad that you are trying to solve the problem, which is very helpful. I agree with your proposal, and I'll test it on by box latter. Indeed, we're in precess context when we are in sea handler. I was thought we can't call schedule() in the exception handler before. Thank you very much! > -%<- > int apei_claim_sea(struct pt_regs *regs) > { > int cpu; > int err = -ENOENT; > unsigned long current_flags = arch_local_save_flags(); > unsigned long interrupted_flags = current_flags; > > if (!IS_ENABLED(CONFIG_ACPI_APEI_SEA)) > return err; > > if (regs) > interrupted_flags = regs->pstate; > > /* > * APEI expects an NMI-like notification to always be called > * in NMI context. > */ > local_daif_restore(DAIF_ERRCTX); > nmi_enter(); > err = ghes_notify_sea(); > cpu = smp_processor_id(); > nmi_exit(); > > /* > * APEI NMI-like notifications are deferred to irq_work. Unless
[PATCH v5 1/3] arm64/ras: support sea error recovery
With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors are consumed. According to the existing process, errors occurred in the kernel, leading to direct panic, if it occurred the user-space, we should just kill process. But there is a class of error, in fact, is not necessary to kill process, you can recover and continue to run the process. Such as the instruction data corrupted, where the memory page might be read-only, which is has not been modified, the disk might have the correct data, so you can directly drop the page, ant reload it when necessary. So this patchset is just try to solve such problem: if the error is consumed in user-space and the error occurs on a clean page, you can directly drop the memory page without killing process. If the corrupted page is clean, just dropped it and return to user-space without side effects. And if corrupted page is dirty, memory_failure() will send SIGBUS with code=BUS_MCEERR_AR. While without this patchset, do_sea() will just send SIGBUS, so the process was killed in the same place. Because memory_failure() may sleep, we can not call it directly in SEA exception context. So we saved faulting physical address associated with a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return from SEA exception context and get into do_notify_resume() before the process running, we could check it and call memory_failure() to do recovery. It's safe, because we are in process context. Signed-off-by: Xie XiuQi <xiexi...@huawei.com> Tested-by: Wang Xiongfeng <wangxiongfe...@huawei.com> Tested-by: Tyler Baicar <tbai...@codeaurora.org> Cc: James Morse <james.mo...@arm.com> Cc: Borislav Petkov <b...@suse.de> Cc: Julien Thierry <julien.thie...@arm.com> Cc: Hanjun Guo <hanjun@linaro.org> --- arch/arm64/Kconfig | 11 +++ arch/arm64/include/asm/ras.h | 23 ++ arch/arm64/include/asm/thread_info.h | 4 +- arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/ras.c | 142 +++ arch/arm64/kernel/signal.c | 7 ++ arch/arm64/mm/fault.c| 27 +-- drivers/acpi/apei/ghes.c | 8 +- include/acpi/ghes.h | 3 + 9 files changed, 216 insertions(+), 10 deletions(-) create mode 100644 arch/arm64/include/asm/ras.h create mode 100644 arch/arm64/kernel/ras.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c9a7e9e..8892923 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -688,6 +688,17 @@ config HOTPLUG_CPU Say Y here to experiment with turning CPUs off and on. CPUs can be controlled through /sys/devices/system/cpu. +config ARM64_ERR_RECOV + bool "Support arm64 RAS error recovery" + depends on ACPI_APEI_SEA && MEMORY_FAILURE + help + With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors + are consumed. In some cases, if the error address is in a clean page or a + read-only page, there is a chance to recover. Such as error occurs in a + instruction page, we can reread this page from disk instead of killing process. + + Say Y if unsure. + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" diff --git a/arch/arm64/include/asm/ras.h b/arch/arm64/include/asm/ras.h new file mode 100644 index 000..f0f18da --- /dev/null +++ b/arch/arm64/include/asm/ras.h @@ -0,0 +1,23 @@ +/* + * ARM64 SEA error recoery support + * + * Copyright 2017 Huawei Technologies Co., Ltd. + * Author: Xie XiuQi <xiexi...@huawei.com> + * Author: Wang Xiongfeng <wangxiongfe...@huawei.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation; + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _ASM_RAS_H +#define _ASM_RAS_H + +extern void sea_notify_process(void); + +#endif /*_ASM_RAS_H*/ diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index eb43128..250d769 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -84,6 +84,7 @@ struct thread_info { #define TIF_FOREIGN_FPSTATE3 /* CPU's FP state is not current's */ #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ #define TIF_FSCHECK5 /* Check FS is USER_DS on return */ +#define TIF_SEA_NOTIFY 6 /* notify to do an error recovery */ #define TIF_NOHZ 7 #define TIF_SYSCALL_TRACE 8 #define TIF_SYSCALL_AUDIT
[PATCH v5 1/3] arm64/ras: support sea error recovery
With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors are consumed. According to the existing process, errors occurred in the kernel, leading to direct panic, if it occurred the user-space, we should just kill process. But there is a class of error, in fact, is not necessary to kill process, you can recover and continue to run the process. Such as the instruction data corrupted, where the memory page might be read-only, which is has not been modified, the disk might have the correct data, so you can directly drop the page, ant reload it when necessary. So this patchset is just try to solve such problem: if the error is consumed in user-space and the error occurs on a clean page, you can directly drop the memory page without killing process. If the corrupted page is clean, just dropped it and return to user-space without side effects. And if corrupted page is dirty, memory_failure() will send SIGBUS with code=BUS_MCEERR_AR. While without this patchset, do_sea() will just send SIGBUS, so the process was killed in the same place. Because memory_failure() may sleep, we can not call it directly in SEA exception context. So we saved faulting physical address associated with a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return from SEA exception context and get into do_notify_resume() before the process running, we could check it and call memory_failure() to do recovery. It's safe, because we are in process context. Signed-off-by: Xie XiuQi Tested-by: Wang Xiongfeng Tested-by: Tyler Baicar Cc: James Morse Cc: Borislav Petkov Cc: Julien Thierry Cc: Hanjun Guo --- arch/arm64/Kconfig | 11 +++ arch/arm64/include/asm/ras.h | 23 ++ arch/arm64/include/asm/thread_info.h | 4 +- arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/ras.c | 142 +++ arch/arm64/kernel/signal.c | 7 ++ arch/arm64/mm/fault.c| 27 +-- drivers/acpi/apei/ghes.c | 8 +- include/acpi/ghes.h | 3 + 9 files changed, 216 insertions(+), 10 deletions(-) create mode 100644 arch/arm64/include/asm/ras.h create mode 100644 arch/arm64/kernel/ras.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c9a7e9e..8892923 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -688,6 +688,17 @@ config HOTPLUG_CPU Say Y here to experiment with turning CPUs off and on. CPUs can be controlled through /sys/devices/system/cpu. +config ARM64_ERR_RECOV + bool "Support arm64 RAS error recovery" + depends on ACPI_APEI_SEA && MEMORY_FAILURE + help + With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors + are consumed. In some cases, if the error address is in a clean page or a + read-only page, there is a chance to recover. Such as error occurs in a + instruction page, we can reread this page from disk instead of killing process. + + Say Y if unsure. + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" diff --git a/arch/arm64/include/asm/ras.h b/arch/arm64/include/asm/ras.h new file mode 100644 index 000..f0f18da --- /dev/null +++ b/arch/arm64/include/asm/ras.h @@ -0,0 +1,23 @@ +/* + * ARM64 SEA error recoery support + * + * Copyright 2017 Huawei Technologies Co., Ltd. + * Author: Xie XiuQi + * Author: Wang Xiongfeng + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation; + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _ASM_RAS_H +#define _ASM_RAS_H + +extern void sea_notify_process(void); + +#endif /*_ASM_RAS_H*/ diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index eb43128..250d769 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -84,6 +84,7 @@ struct thread_info { #define TIF_FOREIGN_FPSTATE3 /* CPU's FP state is not current's */ #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ #define TIF_FSCHECK5 /* Check FS is USER_DS on return */ +#define TIF_SEA_NOTIFY 6 /* notify to do an error recovery */ #define TIF_NOHZ 7 #define TIF_SYSCALL_TRACE 8 #define TIF_SYSCALL_AUDIT 9 @@ -102,6 +103,7 @@ struct thread_info { #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE) #define _TIF_NOHZ (1 << TIF_NOHZ) +#define _TIF_SEA_NOTIFY (1
[PATCH v5 2/3] GHES: add a notify chain for process memory section
Add a notify chain for process memory section, with which other modules might do error recovery. Signed-off-by: Xie XiuQi <xiexi...@huawei.com> Tested-by: Wang Xiongfeng <wangxiongfe...@huawei.com> Tested-by: Tyler Baicar <tbai...@codeaurora.org> --- drivers/acpi/apei/ghes.c | 10 ++ include/acpi/ghes.h | 8 2 files changed, 18 insertions(+) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index cff671d..1f0ebfb 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -109,6 +109,9 @@ static inline bool is_hest_type_generic_v2(struct ghes *ghes) static LIST_HEAD(ghes_hed); static DEFINE_MUTEX(ghes_list_mutex); +ATOMIC_NOTIFIER_HEAD(ghes_mem_err_chain); +EXPORT_SYMBOL(ghes_mem_err_chain); + /* * Because the memory area used to transfer hardware error information * from BIOS to Linux can be determined only in NMI, IRQ or timer @@ -441,6 +444,13 @@ static void ghes_do_proc(struct ghes *ghes, if (guid_equal(sec_type, _SEC_PLATFORM_MEM)) { struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); + struct ghes_mem_err mem; + + mem.notify_type = ghes->generic->notify.type; + mem.severity = gdata->error_severity; + mem.mem_err = mem_err; + + atomic_notifier_call_chain(_mem_err_chain, 0, ); ghes_edac_report_mem_error(ghes, sev, mem_err); diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h index d626367..921ebf1 100644 --- a/include/acpi/ghes.h +++ b/include/acpi/ghes.h @@ -124,4 +124,12 @@ static inline void *acpi_hest_get_next(struct acpi_hest_generic_data *gdata) extern void ghes_arm_process_error(struct ghes *ghes, struct cper_sec_proc_arm *err); +struct ghes_mem_err { + int notify_type; + int severity; + struct cper_sec_mem_err *mem_err; +}; + +extern struct atomic_notifier_head ghes_mem_err_chain; + #endif /* GHES_H */ -- 1.8.3.1
[PATCH v5 2/3] GHES: add a notify chain for process memory section
Add a notify chain for process memory section, with which other modules might do error recovery. Signed-off-by: Xie XiuQi Tested-by: Wang Xiongfeng Tested-by: Tyler Baicar --- drivers/acpi/apei/ghes.c | 10 ++ include/acpi/ghes.h | 8 2 files changed, 18 insertions(+) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index cff671d..1f0ebfb 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -109,6 +109,9 @@ static inline bool is_hest_type_generic_v2(struct ghes *ghes) static LIST_HEAD(ghes_hed); static DEFINE_MUTEX(ghes_list_mutex); +ATOMIC_NOTIFIER_HEAD(ghes_mem_err_chain); +EXPORT_SYMBOL(ghes_mem_err_chain); + /* * Because the memory area used to transfer hardware error information * from BIOS to Linux can be determined only in NMI, IRQ or timer @@ -441,6 +444,13 @@ static void ghes_do_proc(struct ghes *ghes, if (guid_equal(sec_type, _SEC_PLATFORM_MEM)) { struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); + struct ghes_mem_err mem; + + mem.notify_type = ghes->generic->notify.type; + mem.severity = gdata->error_severity; + mem.mem_err = mem_err; + + atomic_notifier_call_chain(_mem_err_chain, 0, ); ghes_edac_report_mem_error(ghes, sev, mem_err); diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h index d626367..921ebf1 100644 --- a/include/acpi/ghes.h +++ b/include/acpi/ghes.h @@ -124,4 +124,12 @@ static inline void *acpi_hest_get_next(struct acpi_hest_generic_data *gdata) extern void ghes_arm_process_error(struct ghes *ghes, struct cper_sec_proc_arm *err); +struct ghes_mem_err { + int notify_type; + int severity; + struct cper_sec_mem_err *mem_err; +}; + +extern struct atomic_notifier_head ghes_mem_err_chain; + #endif /* GHES_H */ -- 1.8.3.1
[PATCH v5 0/3] arm64/ras: support sea error recovery
With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors are consumed. According to the existing process, errors occurred in the kernel, leading to direct panic, if it occurred the user-space, we should just kill process. But there is a class of error, in fact, is not necessary to kill process, you can recover and continue to run the process. Such as the instruction data corrupted, where the memory page might be read-only, which is has not been modified, the disk might have the correct data, so you can directly drop the page, ant reload it when necessary. So this patchset is just try to solve such problem: if the error is consumed in user-space and the error occurs on a clean page, you can directly drop the memory page without killing process. If the corrupted page is clean, just dropped it and return to user-space without side effects. And if corrupted page is dirty, memory_failure() will send SIGBUS with code=BUS_MCEERR_AR. While without this patchset, do_sea() will just send SIGBUS, so the process was killed in the same place. Because memory_failure() may sleep, we can not call it directly in SEA exception context. So we saved faulting physical address associated with a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return from SEA exception context and get into do_notify_resume() before the process running, we could check it and call memory_failure() to do recovery. It's safe, because we are in process context. In some platform, when SEA triggerred, physical address could be reported by memory section or by processor section, so we save address at this two place. --- v5 - v4: - rebased on top of 4.15-rc9 + efi patches efi patches: https://patchwork.codeaurora.org/patch/415877/ https://patchwork.codeaurora.org/patch/415879/ - add Tyler & Xiongfeng's Tested-by. v4 - v3: - rebase on top of the latest mainline - make ghes_arm_process_error as a weak function - only pick cache error from arm processor section for error recovery - fix s-o-b issue https://lkml.org/lkml/2017/9/7/98 v3 - v2: - fix patch style issue v2 - v1: - wrap arm_proc_error_check and log_arm_hw_error in a single arm_process_error() - fix sea_save_info return value issue - fix link error if this CONFIG_ARM64_ERR_RECOV is not selected - use a notify chain instead of call arch_apei_report_mem_error directly https://lkml.org/lkml/2017/9/1/189 Xie XiuQi (3): arm64/ras: support sea error recovery GHES: add a notify chain for process memory section arm64/ras: save error address from memory section for recovery arch/arm64/Kconfig | 11 +++ arch/arm64/include/asm/ras.h | 23 + arch/arm64/include/asm/thread_info.h | 4 +- arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/ras.c | 173 +++ arch/arm64/kernel/signal.c | 7 ++ arch/arm64/mm/fault.c| 27 -- drivers/acpi/apei/ghes.c | 18 +++- include/acpi/ghes.h | 11 +++ 9 files changed, 265 insertions(+), 10 deletions(-) create mode 100644 arch/arm64/include/asm/ras.h create mode 100644 arch/arm64/kernel/ras.c -- 1.8.3.1
[PATCH v5 3/3] arm64/ras: save error address from memory section for recovery
In some platform, when SEA triggerred, physical address might be reported by memory section, so we save it for error recovery later. Signed-off-by: Xie XiuQi <xiexi...@huawei.com> Tested-by: Wang Xiongfeng <wangxiongfe...@huawei.com> Tested-by: Tyler Baicar <tbai...@codeaurora.org> --- arch/arm64/kernel/ras.c | 31 +++ 1 file changed, 31 insertions(+) diff --git a/arch/arm64/kernel/ras.c b/arch/arm64/kernel/ras.c index 05d3871..4690193 100644 --- a/arch/arm64/kernel/ras.c +++ b/arch/arm64/kernel/ras.c @@ -140,3 +140,34 @@ void ghes_arm_process_error(struct ghes *ghes, struct cper_sec_proc_arm *err) if (info_saved) set_thread_flag(TIF_SEA_NOTIFY); } + +int ghes_mem_err_callback(struct notifier_block *nb, unsigned long val, void *data) +{ + bool info_saved = false; + struct ghes_mem_err *ghes_mem = (struct ghes_mem_err *)data; + struct cper_sec_mem_err *mem_err = ghes_mem->mem_err; + + if ((ghes_mem->notify_type != ACPI_HEST_NOTIFY_SEA) || + (ghes_mem->severity != CPER_SEV_RECOVERABLE)) + return 0; + + if (mem_err->validation_bits & CPER_MEM_VALID_PA) + info_saved = sea_save_info(mem_err->physical_addr); + + if (info_saved) + set_thread_flag(TIF_SEA_NOTIFY); + + return 0; +} + +static struct notifier_block ghes_mem_err_nb = { + .notifier_call = ghes_mem_err_callback, +}; + +static int arm64_err_recov_init(void) +{ + atomic_notifier_chain_register(_mem_err_chain, _mem_err_nb); + return 0; +} + +late_initcall(arm64_err_recov_init); -- 1.8.3.1
[PATCH v5 0/3] arm64/ras: support sea error recovery
With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors are consumed. According to the existing process, errors occurred in the kernel, leading to direct panic, if it occurred the user-space, we should just kill process. But there is a class of error, in fact, is not necessary to kill process, you can recover and continue to run the process. Such as the instruction data corrupted, where the memory page might be read-only, which is has not been modified, the disk might have the correct data, so you can directly drop the page, ant reload it when necessary. So this patchset is just try to solve such problem: if the error is consumed in user-space and the error occurs on a clean page, you can directly drop the memory page without killing process. If the corrupted page is clean, just dropped it and return to user-space without side effects. And if corrupted page is dirty, memory_failure() will send SIGBUS with code=BUS_MCEERR_AR. While without this patchset, do_sea() will just send SIGBUS, so the process was killed in the same place. Because memory_failure() may sleep, we can not call it directly in SEA exception context. So we saved faulting physical address associated with a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return from SEA exception context and get into do_notify_resume() before the process running, we could check it and call memory_failure() to do recovery. It's safe, because we are in process context. In some platform, when SEA triggerred, physical address could be reported by memory section or by processor section, so we save address at this two place. --- v5 - v4: - rebased on top of 4.15-rc9 + efi patches efi patches: https://patchwork.codeaurora.org/patch/415877/ https://patchwork.codeaurora.org/patch/415879/ - add Tyler & Xiongfeng's Tested-by. v4 - v3: - rebase on top of the latest mainline - make ghes_arm_process_error as a weak function - only pick cache error from arm processor section for error recovery - fix s-o-b issue https://lkml.org/lkml/2017/9/7/98 v3 - v2: - fix patch style issue v2 - v1: - wrap arm_proc_error_check and log_arm_hw_error in a single arm_process_error() - fix sea_save_info return value issue - fix link error if this CONFIG_ARM64_ERR_RECOV is not selected - use a notify chain instead of call arch_apei_report_mem_error directly https://lkml.org/lkml/2017/9/1/189 Xie XiuQi (3): arm64/ras: support sea error recovery GHES: add a notify chain for process memory section arm64/ras: save error address from memory section for recovery arch/arm64/Kconfig | 11 +++ arch/arm64/include/asm/ras.h | 23 + arch/arm64/include/asm/thread_info.h | 4 +- arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/ras.c | 173 +++ arch/arm64/kernel/signal.c | 7 ++ arch/arm64/mm/fault.c| 27 -- drivers/acpi/apei/ghes.c | 18 +++- include/acpi/ghes.h | 11 +++ 9 files changed, 265 insertions(+), 10 deletions(-) create mode 100644 arch/arm64/include/asm/ras.h create mode 100644 arch/arm64/kernel/ras.c -- 1.8.3.1
[PATCH v5 3/3] arm64/ras: save error address from memory section for recovery
In some platform, when SEA triggerred, physical address might be reported by memory section, so we save it for error recovery later. Signed-off-by: Xie XiuQi Tested-by: Wang Xiongfeng Tested-by: Tyler Baicar --- arch/arm64/kernel/ras.c | 31 +++ 1 file changed, 31 insertions(+) diff --git a/arch/arm64/kernel/ras.c b/arch/arm64/kernel/ras.c index 05d3871..4690193 100644 --- a/arch/arm64/kernel/ras.c +++ b/arch/arm64/kernel/ras.c @@ -140,3 +140,34 @@ void ghes_arm_process_error(struct ghes *ghes, struct cper_sec_proc_arm *err) if (info_saved) set_thread_flag(TIF_SEA_NOTIFY); } + +int ghes_mem_err_callback(struct notifier_block *nb, unsigned long val, void *data) +{ + bool info_saved = false; + struct ghes_mem_err *ghes_mem = (struct ghes_mem_err *)data; + struct cper_sec_mem_err *mem_err = ghes_mem->mem_err; + + if ((ghes_mem->notify_type != ACPI_HEST_NOTIFY_SEA) || + (ghes_mem->severity != CPER_SEV_RECOVERABLE)) + return 0; + + if (mem_err->validation_bits & CPER_MEM_VALID_PA) + info_saved = sea_save_info(mem_err->physical_addr); + + if (info_saved) + set_thread_flag(TIF_SEA_NOTIFY); + + return 0; +} + +static struct notifier_block ghes_mem_err_nb = { + .notifier_call = ghes_mem_err_callback, +}; + +static int arm64_err_recov_init(void) +{ + atomic_notifier_chain_register(_mem_err_chain, _mem_err_nb); + return 0; +} + +late_initcall(arm64_err_recov_init); -- 1.8.3.1
Re: [PATCH v4 0/4] arm64/ras: support sea error recovery
Hi Tyler, On 2018/1/26 1:11, Tyler Baicar wrote: > For this series - Tested-by: Tyler Baicar <tbai...@codeaurora.org> > > Note that this will probably need to be rebased on top of these patches: Thank you for your testing, I'll rebase then. > > https://patchwork.codeaurora.org/patch/415877/ > https://patchwork.codeaurora.org/patch/415879/ > > With that, the first patch should be able to be removed because the above > patches already define the ARM error types: > > +#define CPER_ARM_CACHE_ERROR0 > +#define CPER_ARM_TLB_ERROR1 > +#define CPER_ARM_BUS_ERROR2 > +#define CPER_ARM_VENDOR_ERROR3 -- Thanks, Xie XiuQi
Re: [PATCH v4 0/4] arm64/ras: support sea error recovery
Hi Tyler, On 2018/1/26 1:11, Tyler Baicar wrote: > For this series - Tested-by: Tyler Baicar > > Note that this will probably need to be rebased on top of these patches: Thank you for your testing, I'll rebase then. > > https://patchwork.codeaurora.org/patch/415877/ > https://patchwork.codeaurora.org/patch/415879/ > > With that, the first patch should be able to be removed because the above > patches already define the ARM error types: > > +#define CPER_ARM_CACHE_ERROR0 > +#define CPER_ARM_TLB_ERROR1 > +#define CPER_ARM_BUS_ERROR2 > +#define CPER_ARM_VENDOR_ERROR3 -- Thanks, Xie XiuQi
Re: [RESEND PATCH V2] arm64: fault: avoid send SIGBUS two times
Hi Dongjiu, On 2017/12/12 0:05, Dongjiu Geng wrote: > do_sea() calls arm64_notify_die() which will always signal > user-space. It also returns whether APEI claimed the external > abort as a RAS notification. If it returns failure do_mem_abort() > will signal user-space too. > > do_mem_abort() wants to know if we handled the error, we always > call arm64_notify_die() so can always return success. > > Signed-off-by: Dongjiu Geng <gengdong...@huawei.com> > --- > 1. Address James's comments to update the commit messages > 2. Address James's comments to not change the si_code for SIGBUS > --- > arch/arm64/mm/fault.c | 7 +++ > 1 file changed, 3 insertions(+), 4 deletions(-) > > diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c > index b64958b..38b9f3e 100644 > --- a/arch/arm64/mm/fault.c > +++ b/arch/arm64/mm/fault.c > @@ -610,7 +610,6 @@ static int do_sea(unsigned long addr, unsigned int esr, > struct pt_regs *regs) > { > struct siginfo info; > const struct fault_info *inf; > - int ret = 0; > > inf = esr_to_fault_info(esr); > pr_err("Synchronous External Abort: %s (0x%08x) at 0x%016lx\n", > @@ -625,7 +624,7 @@ static int do_sea(unsigned long addr, unsigned int esr, > struct pt_regs *regs) > if (interrupts_enabled(regs)) > nmi_enter(); > > - ret = ghes_notify_sea(); > + ghes_notify_sea(); > > if (interrupts_enabled(regs)) > nmi_exit(); > @@ -640,7 +639,7 @@ static int do_sea(unsigned long addr, unsigned int esr, > struct pt_regs *regs) > info.si_addr = (void __user *)addr; > arm64_notify_die("", regs, , esr); > > - return ret; > + return 0; It looks good to me. do_sea() has done all necessary action for SEA, so it should always return 0, no matter ghes_notify_sea() return true or false. Reviewed-by: Xie XiuQi <xiexi...@huawei.com> > } > > static const struct fault_info fault_info[] = { > -- > 2.10.1 > -- Thanks, Xie XiuQi
Re: [RESEND PATCH V2] arm64: fault: avoid send SIGBUS two times
Hi Dongjiu, On 2017/12/12 0:05, Dongjiu Geng wrote: > do_sea() calls arm64_notify_die() which will always signal > user-space. It also returns whether APEI claimed the external > abort as a RAS notification. If it returns failure do_mem_abort() > will signal user-space too. > > do_mem_abort() wants to know if we handled the error, we always > call arm64_notify_die() so can always return success. > > Signed-off-by: Dongjiu Geng > --- > 1. Address James's comments to update the commit messages > 2. Address James's comments to not change the si_code for SIGBUS > --- > arch/arm64/mm/fault.c | 7 +++ > 1 file changed, 3 insertions(+), 4 deletions(-) > > diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c > index b64958b..38b9f3e 100644 > --- a/arch/arm64/mm/fault.c > +++ b/arch/arm64/mm/fault.c > @@ -610,7 +610,6 @@ static int do_sea(unsigned long addr, unsigned int esr, > struct pt_regs *regs) > { > struct siginfo info; > const struct fault_info *inf; > - int ret = 0; > > inf = esr_to_fault_info(esr); > pr_err("Synchronous External Abort: %s (0x%08x) at 0x%016lx\n", > @@ -625,7 +624,7 @@ static int do_sea(unsigned long addr, unsigned int esr, > struct pt_regs *regs) > if (interrupts_enabled(regs)) > nmi_enter(); > > - ret = ghes_notify_sea(); > + ghes_notify_sea(); > > if (interrupts_enabled(regs)) > nmi_exit(); > @@ -640,7 +639,7 @@ static int do_sea(unsigned long addr, unsigned int esr, > struct pt_regs *regs) > info.si_addr = (void __user *)addr; > arm64_notify_die("", regs, , esr); > > - return ret; > + return 0; It looks good to me. do_sea() has done all necessary action for SEA, so it should always return 0, no matter ghes_notify_sea() return true or false. Reviewed-by: Xie XiuQi > } > > static const struct fault_info fault_info[] = { > -- > 2.10.1 > -- Thanks, Xie XiuQi
[tip:ras/core] x86/MCE: Extend table to report action optional errors through CMCI too
Commit-ID: e085ac7a6ddbd746966083c5e13aa290c3e9a253 Gitweb: https://git.kernel.org/tip/e085ac7a6ddbd746966083c5e13aa290c3e9a253 Author: Xie XiuQi <xiexi...@huawei.com> AuthorDate: Mon, 4 Dec 2017 17:54:37 +0100 Committer: Thomas Gleixner <t...@linutronix.de> CommitDate: Mon, 4 Dec 2017 20:38:44 +0100 x86/MCE: Extend table to report action optional errors through CMCI too According to the Intel SDM Volume 3B (253669-063US, July 2017), action optional (SRAO) errors can be reported either via MCE or CMC: In cases when SRAO is signaled via CMCI the error signature is indicated via UC=1, PCC=0, S=0. Type(*1) UC EN PCC S AR Signaling --- UC1 1 1 x x MCE SRAR 1 1 0 1 1 MCE SRAO 1 x(*2) 0 x(*2) 0 MCE/CMC UCNA 1 x 0 0 0 CMC CE0 x x x x CMC NOTES: 1. SRAR, SRAO and UCNA errors are supported by the processor only when IA32_MCG_CAP[24] (MCG_SER_P) is set. 2. EN=1, S=1 when signaled via MCE. EN=x, S=0 when signaled via CMC. And there is a description in 15.6.2 UCR Error Reporting and Logging, for bit S: S (Signaling) flag, bit 56 - Indicates (when set) that a machine check exception was generated for the UCR error reported in this MC bank... When the S flag in the IA32_MCi_STATUS register is clear, this UCR error was not signaled via a machine check exception and instead was reported as a corrected machine check (CMC). So merge the two cases and just remove the S=0 check for SRAO in mce_severity(). [ Borislav: Massage commit message.] Signed-off-by: Xie XiuQi <xiexi...@huawei.com> Signed-off-by: Borislav Petkov <b...@suse.de> Signed-off-by: Thomas Gleixner <t...@linutronix.de> Reviewed-by: Tony Luck <tony.l...@intel.com> Tested-by: Chen Wei <chenwe...@huawei.com> Cc: linux-edac <linux-e...@vger.kernel.org> Link: http://lkml.kernel.org/r/1511575548-41992-1-git-send-email-xiexi...@huawei.com --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 26 +- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 4ca632a..5bbd06f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -59,6 +59,7 @@ static struct severity { #define MCGMASK(x, y) .mcgmask = x, .mcgres = y #define MASK(x, y).mask = x, .result = y #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) +#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR) #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) #defineMCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) @@ -101,6 +102,22 @@ static struct severity { NOSER, BITCLR(MCI_STATUS_UC) ), + /* +* known AO MCACODs reported via MCE or CMC: +* +* SRAO could be signaled either via a machine check exception or +* CMCI with the corresponding bit S 1 or 0. So we don't need to +* check bit S for SRAO. +*/ + MCESEV( + AO, "Action optional: memory scrubbing error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB) + ), + MCESEV( + AO, "Action optional: last level cache writeback error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) + ), + /* ignore OVER for UCNA */ MCESEV( UCNA, "Uncorrected no action required", @@ -149,15 +166,6 @@ static struct severity { SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) ), - /* known AO MCACODs: */ - MCESEV( - AO, "Action optional: memory scrubbing error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB) - ), - MCESEV( - AO, "Action optional: last level cache writeback error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB) - ), MCESEV( SOME, "Action optional: unknown MCACOD", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
[tip:ras/core] x86/MCE: Extend table to report action optional errors through CMCI too
Commit-ID: e085ac7a6ddbd746966083c5e13aa290c3e9a253 Gitweb: https://git.kernel.org/tip/e085ac7a6ddbd746966083c5e13aa290c3e9a253 Author: Xie XiuQi AuthorDate: Mon, 4 Dec 2017 17:54:37 +0100 Committer: Thomas Gleixner CommitDate: Mon, 4 Dec 2017 20:38:44 +0100 x86/MCE: Extend table to report action optional errors through CMCI too According to the Intel SDM Volume 3B (253669-063US, July 2017), action optional (SRAO) errors can be reported either via MCE or CMC: In cases when SRAO is signaled via CMCI the error signature is indicated via UC=1, PCC=0, S=0. Type(*1) UC EN PCC S AR Signaling --- UC1 1 1 x x MCE SRAR 1 1 0 1 1 MCE SRAO 1 x(*2) 0 x(*2) 0 MCE/CMC UCNA 1 x 0 0 0 CMC CE0 x x x x CMC NOTES: 1. SRAR, SRAO and UCNA errors are supported by the processor only when IA32_MCG_CAP[24] (MCG_SER_P) is set. 2. EN=1, S=1 when signaled via MCE. EN=x, S=0 when signaled via CMC. And there is a description in 15.6.2 UCR Error Reporting and Logging, for bit S: S (Signaling) flag, bit 56 - Indicates (when set) that a machine check exception was generated for the UCR error reported in this MC bank... When the S flag in the IA32_MCi_STATUS register is clear, this UCR error was not signaled via a machine check exception and instead was reported as a corrected machine check (CMC). So merge the two cases and just remove the S=0 check for SRAO in mce_severity(). [ Borislav: Massage commit message.] Signed-off-by: Xie XiuQi Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Tested-by: Chen Wei Cc: linux-edac Link: http://lkml.kernel.org/r/1511575548-41992-1-git-send-email-xiexi...@huawei.com --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 26 +- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 4ca632a..5bbd06f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -59,6 +59,7 @@ static struct severity { #define MCGMASK(x, y) .mcgmask = x, .mcgres = y #define MASK(x, y).mask = x, .result = y #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) +#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR) #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) #defineMCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) @@ -101,6 +102,22 @@ static struct severity { NOSER, BITCLR(MCI_STATUS_UC) ), + /* +* known AO MCACODs reported via MCE or CMC: +* +* SRAO could be signaled either via a machine check exception or +* CMCI with the corresponding bit S 1 or 0. So we don't need to +* check bit S for SRAO. +*/ + MCESEV( + AO, "Action optional: memory scrubbing error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB) + ), + MCESEV( + AO, "Action optional: last level cache writeback error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) + ), + /* ignore OVER for UCNA */ MCESEV( UCNA, "Uncorrected no action required", @@ -149,15 +166,6 @@ static struct severity { SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) ), - /* known AO MCACODs: */ - MCESEV( - AO, "Action optional: memory scrubbing error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB) - ), - MCESEV( - AO, "Action optional: last level cache writeback error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB) - ), MCESEV( SOME, "Action optional: unknown MCACOD", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
[PATCH resend] trace/xdp: fix compile warning: 'struct bpf_map' declared inside parameter list
We meet this compile warning, which caused by missing bpf.h in xdp.h. In file included from ./include/trace/events/xdp.h:10:0, from ./include/linux/bpf_trace.h:6, from drivers/net/ethernet/intel/i40e/i40e_txrx.c:29: ./include/trace/events/xdp.h:93:17: warning: ‘struct bpf_map’ declared inside parameter list will not be visible outside of this definition or declaration const struct bpf_map *map, u32 map_index), ^ ./include/linux/tracepoint.h:187:34: note: in definition of macro ‘__DECLARE_TRACE’ static inline void trace_##name(proto)\ ^ ./include/linux/tracepoint.h:352:24: note: in expansion of macro ‘PARAMS’ __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \ ^~ ./include/linux/tracepoint.h:477:2: note: in expansion of macro ‘DECLARE_TRACE’ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) ^ ./include/linux/tracepoint.h:477:22: note: in expansion of macro ‘PARAMS’ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) ^~ ./include/trace/events/xdp.h:89:1: note: in expansion of macro ‘DEFINE_EVENT’ DEFINE_EVENT(xdp_redirect_template, xdp_redirect, ^~~~ ./include/trace/events/xdp.h:90:2: note: in expansion of macro ‘TP_PROTO’ TP_PROTO(const struct net_device *dev, ^~~~ ./include/trace/events/xdp.h:93:17: warning: ‘struct bpf_map’ declared inside parameter list will not be visible outside of this definition or declaration const struct bpf_map *map, u32 map_index), ^ ./include/linux/tracepoint.h:203:38: note: in definition of macro ‘__DECLARE_TRACE’ register_trace_##name(void (*probe)(data_proto), void *data) \ ^~ ./include/linux/tracepoint.h:354:4: note: in expansion of macro ‘PARAMS’ PARAMS(void *__data, proto), \ ^~ Reported-by: Huang Daode <huangda...@hisilicon.com> Cc: Hanjun Guo <guohan...@huawei.com> Fixes: 8d3b778ff544 ("xdp: tracepoint xdp_redirect also need a map argument") Signed-off-by: Xie XiuQi <xiexi...@huawei.com> Acked-by: Jesper Dangaard Brouer <bro...@redhat.com> Acked-by: Steven Rostedt (VMware) <rost...@goodmis.org> --- include/trace/events/xdp.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 4cd0f05..8989a92 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -8,6 +8,7 @@ #include #include #include +#include #define __XDP_ACT_MAP(FN) \ FN(ABORTED) \ -- 1.8.3.1
[PATCH resend] trace/xdp: fix compile warning: 'struct bpf_map' declared inside parameter list
We meet this compile warning, which caused by missing bpf.h in xdp.h. In file included from ./include/trace/events/xdp.h:10:0, from ./include/linux/bpf_trace.h:6, from drivers/net/ethernet/intel/i40e/i40e_txrx.c:29: ./include/trace/events/xdp.h:93:17: warning: ‘struct bpf_map’ declared inside parameter list will not be visible outside of this definition or declaration const struct bpf_map *map, u32 map_index), ^ ./include/linux/tracepoint.h:187:34: note: in definition of macro ‘__DECLARE_TRACE’ static inline void trace_##name(proto)\ ^ ./include/linux/tracepoint.h:352:24: note: in expansion of macro ‘PARAMS’ __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \ ^~ ./include/linux/tracepoint.h:477:2: note: in expansion of macro ‘DECLARE_TRACE’ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) ^ ./include/linux/tracepoint.h:477:22: note: in expansion of macro ‘PARAMS’ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) ^~ ./include/trace/events/xdp.h:89:1: note: in expansion of macro ‘DEFINE_EVENT’ DEFINE_EVENT(xdp_redirect_template, xdp_redirect, ^~~~ ./include/trace/events/xdp.h:90:2: note: in expansion of macro ‘TP_PROTO’ TP_PROTO(const struct net_device *dev, ^~~~ ./include/trace/events/xdp.h:93:17: warning: ‘struct bpf_map’ declared inside parameter list will not be visible outside of this definition or declaration const struct bpf_map *map, u32 map_index), ^ ./include/linux/tracepoint.h:203:38: note: in definition of macro ‘__DECLARE_TRACE’ register_trace_##name(void (*probe)(data_proto), void *data) \ ^~ ./include/linux/tracepoint.h:354:4: note: in expansion of macro ‘PARAMS’ PARAMS(void *__data, proto), \ ^~ Reported-by: Huang Daode Cc: Hanjun Guo Fixes: 8d3b778ff544 ("xdp: tracepoint xdp_redirect also need a map argument") Signed-off-by: Xie XiuQi Acked-by: Jesper Dangaard Brouer Acked-by: Steven Rostedt (VMware) --- include/trace/events/xdp.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 4cd0f05..8989a92 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -8,6 +8,7 @@ #include #include #include +#include #define __XDP_ACT_MAP(FN) \ FN(ABORTED) \ -- 1.8.3.1
Re: [PATCH] trace/xdp: fix compile warning: ‘struct bpf_map’ declared inside parameter list
On 2017/11/29 19:13, Daniel Borkmann wrote: > Xie, thanks for the patch! We could route this fix via bpf tree if you want. > > Could you resend your patch with below Fixes and Acked-by tag added to > net...@vger.kernel.org in Cc, so that it ends up in patchwork there? > Sure, I'll resend soon. >> Fixes: 8d3b778ff544 ("xdp: tracepoint xdp_redirect also need a map argument") >> >> Acked-by: Jesper Dangaard Brouer <bro...@redhat.com> > Thanks, > Daniel -- Thanks, Xie XiuQi
Re: [PATCH] trace/xdp: fix compile warning: ‘struct bpf_map’ declared inside parameter list
On 2017/11/29 19:13, Daniel Borkmann wrote: > Xie, thanks for the patch! We could route this fix via bpf tree if you want. > > Could you resend your patch with below Fixes and Acked-by tag added to > net...@vger.kernel.org in Cc, so that it ends up in patchwork there? > Sure, I'll resend soon. >> Fixes: 8d3b778ff544 ("xdp: tracepoint xdp_redirect also need a map argument") >> >> Acked-by: Jesper Dangaard Brouer > Thanks, > Daniel -- Thanks, Xie XiuQi
[PATCH] trace/xdp: fix compile warning: ‘struct bpf_map’ declared inside parameter list
We meet this compile warning, which caused by missing bpf.h in xdp.h. In file included from ./include/trace/events/xdp.h:10:0, from ./include/linux/bpf_trace.h:6, from drivers/net/ethernet/intel/i40e/i40e_txrx.c:29: ./include/trace/events/xdp.h:93:17: warning: ‘struct bpf_map’ declared inside parameter list will not be visible outside of this definition or declaration const struct bpf_map *map, u32 map_index), ^ ./include/linux/tracepoint.h:187:34: note: in definition of macro ‘__DECLARE_TRACE’ static inline void trace_##name(proto)\ ^ ./include/linux/tracepoint.h:352:24: note: in expansion of macro ‘PARAMS’ __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \ ^~ ./include/linux/tracepoint.h:477:2: note: in expansion of macro ‘DECLARE_TRACE’ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) ^ ./include/linux/tracepoint.h:477:22: note: in expansion of macro ‘PARAMS’ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) ^~ ./include/trace/events/xdp.h:89:1: note: in expansion of macro ‘DEFINE_EVENT’ DEFINE_EVENT(xdp_redirect_template, xdp_redirect, ^~~~ ./include/trace/events/xdp.h:90:2: note: in expansion of macro ‘TP_PROTO’ TP_PROTO(const struct net_device *dev, ^~~~ ./include/trace/events/xdp.h:93:17: warning: ‘struct bpf_map’ declared inside parameter list will not be visible outside of this definition or declaration const struct bpf_map *map, u32 map_index), ^ ./include/linux/tracepoint.h:203:38: note: in definition of macro ‘__DECLARE_TRACE’ register_trace_##name(void (*probe)(data_proto), void *data) \ ^~ ./include/linux/tracepoint.h:354:4: note: in expansion of macro ‘PARAMS’ PARAMS(void *__data, proto), \ ^~ Reported-by: Huang Daode <huangda...@hisilicon.com> Cc: Hanjun Guo <guohan...@huawei.com> Signed-off-by: Xie XiuQi <xiexi...@huawei.com> --- include/trace/events/xdp.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 4cd0f05..8989a92 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -8,6 +8,7 @@ #include #include #include +#include #define __XDP_ACT_MAP(FN) \ FN(ABORTED) \ -- 1.8.3.1
[PATCH] trace/xdp: fix compile warning: ‘struct bpf_map’ declared inside parameter list
We meet this compile warning, which caused by missing bpf.h in xdp.h. In file included from ./include/trace/events/xdp.h:10:0, from ./include/linux/bpf_trace.h:6, from drivers/net/ethernet/intel/i40e/i40e_txrx.c:29: ./include/trace/events/xdp.h:93:17: warning: ‘struct bpf_map’ declared inside parameter list will not be visible outside of this definition or declaration const struct bpf_map *map, u32 map_index), ^ ./include/linux/tracepoint.h:187:34: note: in definition of macro ‘__DECLARE_TRACE’ static inline void trace_##name(proto)\ ^ ./include/linux/tracepoint.h:352:24: note: in expansion of macro ‘PARAMS’ __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \ ^~ ./include/linux/tracepoint.h:477:2: note: in expansion of macro ‘DECLARE_TRACE’ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) ^ ./include/linux/tracepoint.h:477:22: note: in expansion of macro ‘PARAMS’ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) ^~ ./include/trace/events/xdp.h:89:1: note: in expansion of macro ‘DEFINE_EVENT’ DEFINE_EVENT(xdp_redirect_template, xdp_redirect, ^~~~ ./include/trace/events/xdp.h:90:2: note: in expansion of macro ‘TP_PROTO’ TP_PROTO(const struct net_device *dev, ^~~~ ./include/trace/events/xdp.h:93:17: warning: ‘struct bpf_map’ declared inside parameter list will not be visible outside of this definition or declaration const struct bpf_map *map, u32 map_index), ^ ./include/linux/tracepoint.h:203:38: note: in definition of macro ‘__DECLARE_TRACE’ register_trace_##name(void (*probe)(data_proto), void *data) \ ^~ ./include/linux/tracepoint.h:354:4: note: in expansion of macro ‘PARAMS’ PARAMS(void *__data, proto), \ ^~ Reported-by: Huang Daode Cc: Hanjun Guo Signed-off-by: Xie XiuQi --- include/trace/events/xdp.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 4cd0f05..8989a92 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -8,6 +8,7 @@ #include #include #include +#include #define __XDP_ACT_MAP(FN) \ FN(ABORTED) \ -- 1.8.3.1
[PATCH v2] x86/mce: add support SRAO reported via CMC check
In Intel SDM Volume 3B (253669-063US, July 2017), SRAO could be reported either via MCE or CMC: In cases when SRAO is signaled via CMCI the error signature is indicated via UC=1, PCC=0, S=0. Type(*1) UC EN PCC S AR Signaling --- UC1 1 1 x x MCE SRAR 1 1 0 1 1 MCE SRAO 1 x(*2) 0 x(*2) 0 MCE/CMC UCNA 1 x 0 0 0 CMC CE0 x x x x CMC NOTES: 1. SRAR, SRAO and UCNA errors are supported by the processor only when IA32_MCG_CAP[24] (MCG_SER_P) is set. 2. EN=1, S=1 when signaled via MCE. EN=x, S=0 when signaled via CMC. And there is a description in 15.6.2 UCR Error Reporting and Logging, for bit S: S (Signaling) flag, bit 56 - Indicates (when set) that a machine check exception was generated for the UCR error reported in this MC bank... When the S flag in the IA32_MCi_STATUS register is clear, this UCR error was not signaled via a machine check exception and instead was reported as a corrected machine check (CMC). So we could merge this two case, and just remove the S=0 check for SRAO in mce_severity(). --- v2: add OVER=0 check and merge MCE and CMC case. Signed-off-by: Xie XiuQi <xiexi...@huawei.com> Tested-by: Chen Wei <chenwe...@huawei.com> Reviewed-by: Tony Luck <tony.l...@intel.com> --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 26 +- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 4ca632a..5bbd06f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -59,6 +59,7 @@ #define MCGMASK(x, y) .mcgmask = x, .mcgres = y #define MASK(x, y).mask = x, .result = y #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) +#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR) #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) #defineMCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) @@ -101,6 +102,22 @@ NOSER, BITCLR(MCI_STATUS_UC) ), + /* +* known AO MCACODs reported via MCE or CMC: +* +* SRAO could be signaled either via a machine check exception or +* CMCI with the corresponding bit S 1 or 0. So we don't need to +* check bit S for SRAO. +*/ + MCESEV( + AO, "Action optional: memory scrubbing error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB) + ), + MCESEV( + AO, "Action optional: last level cache writeback error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) + ), + /* ignore OVER for UCNA */ MCESEV( UCNA, "Uncorrected no action required", @@ -149,15 +166,6 @@ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) ), - /* known AO MCACODs: */ - MCESEV( - AO, "Action optional: memory scrubbing error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB) - ), - MCESEV( - AO, "Action optional: last level cache writeback error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB) - ), MCESEV( SOME, "Action optional: unknown MCACOD", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S) -- 1.8.3.1
[PATCH v2] x86/mce: add support SRAO reported via CMC check
In Intel SDM Volume 3B (253669-063US, July 2017), SRAO could be reported either via MCE or CMC: In cases when SRAO is signaled via CMCI the error signature is indicated via UC=1, PCC=0, S=0. Type(*1) UC EN PCC S AR Signaling --- UC1 1 1 x x MCE SRAR 1 1 0 1 1 MCE SRAO 1 x(*2) 0 x(*2) 0 MCE/CMC UCNA 1 x 0 0 0 CMC CE0 x x x x CMC NOTES: 1. SRAR, SRAO and UCNA errors are supported by the processor only when IA32_MCG_CAP[24] (MCG_SER_P) is set. 2. EN=1, S=1 when signaled via MCE. EN=x, S=0 when signaled via CMC. And there is a description in 15.6.2 UCR Error Reporting and Logging, for bit S: S (Signaling) flag, bit 56 - Indicates (when set) that a machine check exception was generated for the UCR error reported in this MC bank... When the S flag in the IA32_MCi_STATUS register is clear, this UCR error was not signaled via a machine check exception and instead was reported as a corrected machine check (CMC). So we could merge this two case, and just remove the S=0 check for SRAO in mce_severity(). --- v2: add OVER=0 check and merge MCE and CMC case. Signed-off-by: Xie XiuQi Tested-by: Chen Wei Reviewed-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 26 +- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 4ca632a..5bbd06f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -59,6 +59,7 @@ #define MCGMASK(x, y) .mcgmask = x, .mcgres = y #define MASK(x, y).mask = x, .result = y #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) +#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR) #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) #defineMCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) @@ -101,6 +102,22 @@ NOSER, BITCLR(MCI_STATUS_UC) ), + /* +* known AO MCACODs reported via MCE or CMC: +* +* SRAO could be signaled either via a machine check exception or +* CMCI with the corresponding bit S 1 or 0. So we don't need to +* check bit S for SRAO. +*/ + MCESEV( + AO, "Action optional: memory scrubbing error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB) + ), + MCESEV( + AO, "Action optional: last level cache writeback error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) + ), + /* ignore OVER for UCNA */ MCESEV( UCNA, "Uncorrected no action required", @@ -149,15 +166,6 @@ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) ), - /* known AO MCACODs: */ - MCESEV( - AO, "Action optional: memory scrubbing error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB) - ), - MCESEV( - AO, "Action optional: last level cache writeback error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB) - ), MCESEV( SOME, "Action optional: unknown MCACOD", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S) -- 1.8.3.1
Re: [PATCH] x86/mce: add support SRAO reported via CMC check
Hi Borislav, Tony, On 2017/11/15 18:33, Borislav Petkov wrote: > On Wed, Nov 15, 2017 at 02:44:07AM +, Luck, Tony wrote: >> This code is subtle :-( > > I'm glad that we agree on this! :-) > > Anyone wanting to rewrite it yet? > In Intel SDM Volume 3B (253669-063US, July 2017), SRAO could be reported either via MCE or CMC: In cases when SRAO is signaled via CMCI the error signature is indicated via UC=1, PCC=0, S=0. Type(*1) UC EN PCC S AR Signaling --- UC1 1 1 x x MCE SRAR 1 1 0 1 1 MCE SRAO 1 x(*2) 0 x(*2) 0 MCE/CMC UCNA 1 x 0 0 0 CMC CE0 x x x x CMC NOTES: 1. SRAR, SRAO and UCNA errors are supported by the processor only when IA32_MCG_CAP[24] (MCG_SER_P) is set. 2. EN=1, S=1 when signaled via MCE. EN=x, S=0 when signaled via CMC. And there is a description in 15.6.2 UCR Error Reporting and Logging, for bit S: S (Signaling) flag, bit 56 - Indicates (when set) that a machine check exception was generated for the UCR error reported in this MC bank... When the S flag in the IA32_MCi_STATUS register is clear, this UCR error was not signaled via a machine check exception and instead was reported as a corrected machine check (CMC). As the description in SDM, I think this flag could be used to determine whether MCE or CMC was triggered. So we could merge this two case in one and just remove the S=0 check for SRAO. How about this patch? >From a06b2a781a86e3b1fe241591b53f7a6d33d63331 Mon Sep 17 00:00:00 2001 From: Xie XiuQi <xiexi...@huawei.com> Date: Tue, 14 Nov 2017 10:13:22 +0800 Subject: [PATCH] x86/mce: add support SRAO reported via CMC check In Intel SDM Volume 3B (253669-063US, July 2017), SRAO could be reported either via MCE or CMC: In cases when SRAO is signaled via CMCI the error signature is indicated via UC=1, PCC=0, S=0. Type(*1) UC EN PCC S AR Signaling --- UC1 1 1 x x MCE SRAR 1 1 0 1 1 MCE SRAO 1 x(*2) 0 x(*2) 0 MCE/CMC UCNA 1 x 0 0 0 CMC CE0 x x x x CMC NOTES: 1. SRAR, SRAO and UCNA errors are supported by the processor only when IA32_MCG_CAP[24] (MCG_SER_P) is set. 2. EN=1, S=1 when signaled via MCE. EN=x, S=0 when signaled via CMC. And there is a description in 15.6.2 UCR Error Reporting and Logging, for bit S: S (Signaling) flag, bit 56 - Indicates (when set) that a machine check exception was generated for the UCR error reported in this MC bank... When the S flag in the IA32_MCi_STATUS register is clear, this UCR error was not signaled via a machine check exception and instead was reported as a corrected machine check (CMC). So we could merge this two case, and just remove the S=0 check for SRAO in mce_severity(). Signed-off-by: Xie XiuQi <xiexi...@huawei.com> Tested-by: Chen Wei <chenwe...@huawei.com> --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 26 +- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 4ca632a..5bbd06f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -59,6 +59,7 @@ #define MCGMASK(x, y) .mcgmask = x, .mcgres = y #define MASK(x, y).mask = x, .result = y #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) +#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR) #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) #defineMCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) @@ -101,6 +102,22 @@ NOSER, BITCLR(MCI_STATUS_UC) ), + /* +* known AO MCACODs reported via MCE or CMC: +* +* SRAO could be signaled either via a machine check exception or +* CMCI with the corresponding bit S 1 or 0. So we don't need to +* check bit S for SRAO. +*/ + MCESEV( + AO, "Action optional: memory scrubbing error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB) + ), + MCESEV( + AO, "Action optional: last level cache writeback error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) + ), + /* ignore OVER for UCNA */ MCESEV( UCNA, "Uncorrected no action required", @@ -149,15
Re: [PATCH] x86/mce: add support SRAO reported via CMC check
Hi Borislav, Tony, On 2017/11/15 18:33, Borislav Petkov wrote: > On Wed, Nov 15, 2017 at 02:44:07AM +, Luck, Tony wrote: >> This code is subtle :-( > > I'm glad that we agree on this! :-) > > Anyone wanting to rewrite it yet? > In Intel SDM Volume 3B (253669-063US, July 2017), SRAO could be reported either via MCE or CMC: In cases when SRAO is signaled via CMCI the error signature is indicated via UC=1, PCC=0, S=0. Type(*1) UC EN PCC S AR Signaling --- UC1 1 1 x x MCE SRAR 1 1 0 1 1 MCE SRAO 1 x(*2) 0 x(*2) 0 MCE/CMC UCNA 1 x 0 0 0 CMC CE0 x x x x CMC NOTES: 1. SRAR, SRAO and UCNA errors are supported by the processor only when IA32_MCG_CAP[24] (MCG_SER_P) is set. 2. EN=1, S=1 when signaled via MCE. EN=x, S=0 when signaled via CMC. And there is a description in 15.6.2 UCR Error Reporting and Logging, for bit S: S (Signaling) flag, bit 56 - Indicates (when set) that a machine check exception was generated for the UCR error reported in this MC bank... When the S flag in the IA32_MCi_STATUS register is clear, this UCR error was not signaled via a machine check exception and instead was reported as a corrected machine check (CMC). As the description in SDM, I think this flag could be used to determine whether MCE or CMC was triggered. So we could merge this two case in one and just remove the S=0 check for SRAO. How about this patch? >From a06b2a781a86e3b1fe241591b53f7a6d33d63331 Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Tue, 14 Nov 2017 10:13:22 +0800 Subject: [PATCH] x86/mce: add support SRAO reported via CMC check In Intel SDM Volume 3B (253669-063US, July 2017), SRAO could be reported either via MCE or CMC: In cases when SRAO is signaled via CMCI the error signature is indicated via UC=1, PCC=0, S=0. Type(*1) UC EN PCC S AR Signaling --- UC1 1 1 x x MCE SRAR 1 1 0 1 1 MCE SRAO 1 x(*2) 0 x(*2) 0 MCE/CMC UCNA 1 x 0 0 0 CMC CE0 x x x x CMC NOTES: 1. SRAR, SRAO and UCNA errors are supported by the processor only when IA32_MCG_CAP[24] (MCG_SER_P) is set. 2. EN=1, S=1 when signaled via MCE. EN=x, S=0 when signaled via CMC. And there is a description in 15.6.2 UCR Error Reporting and Logging, for bit S: S (Signaling) flag, bit 56 - Indicates (when set) that a machine check exception was generated for the UCR error reported in this MC bank... When the S flag in the IA32_MCi_STATUS register is clear, this UCR error was not signaled via a machine check exception and instead was reported as a corrected machine check (CMC). So we could merge this two case, and just remove the S=0 check for SRAO in mce_severity(). Signed-off-by: Xie XiuQi Tested-by: Chen Wei --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 26 +- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 4ca632a..5bbd06f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -59,6 +59,7 @@ #define MCGMASK(x, y) .mcgmask = x, .mcgres = y #define MASK(x, y).mask = x, .result = y #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) +#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR) #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) #defineMCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) @@ -101,6 +102,22 @@ NOSER, BITCLR(MCI_STATUS_UC) ), + /* +* known AO MCACODs reported via MCE or CMC: +* +* SRAO could be signaled either via a machine check exception or +* CMCI with the corresponding bit S 1 or 0. So we don't need to +* check bit S for SRAO. +*/ + MCESEV( + AO, "Action optional: memory scrubbing error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB) + ), + MCESEV( + AO, "Action optional: last level cache writeback error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) + ), + /* ignore OVER for UCNA */ MCESEV( UCNA, "Uncorrected no action required", @@ -149,15 +166,6 @@ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) ), - /* known A
Re: [PATCH] x86/mce: add support SRAO reported via CMC check
Hi Tony, On 2017/11/15 10:13, Luck, Tony wrote: >> OK, I'll add check for OVER=0 in v2. > > Thinking a bit more on this ... do you just need to remove the check > for S=1 from the existing "AO" entries? Or do we need the CMCI > match entry early in the table? Yes, we could just remove the check for S=1, and place "AO" entries before "UCNA" entries. Is that right? > > -Tony > -- > To unsubscribe from this list: send the line "unsubscribe linux-edac" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > > -- Thanks, Xie XiuQi
Re: [PATCH] x86/mce: add support SRAO reported via CMC check
Hi Tony, On 2017/11/15 10:13, Luck, Tony wrote: >> OK, I'll add check for OVER=0 in v2. > > Thinking a bit more on this ... do you just need to remove the check > for S=1 from the existing "AO" entries? Or do we need the CMCI > match entry early in the table? Yes, we could just remove the check for S=1, and place "AO" entries before "UCNA" entries. Is that right? > > -Tony > -- > To unsubscribe from this list: send the line "unsubscribe linux-edac" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > > -- Thanks, Xie XiuQi
Re: [PATCH] x86/mce: add support SRAO reported via CMC check
Hi Tony, On 2017/11/15 2:51, Luck, Tony wrote: > On Tue, Nov 14, 2017 at 01:55:11PM +0800, Xie XiuQi wrote: >> +/* known AO MCACODs reported via CMC: */ >> +MCESEV( >> +AO, "Action optional: memory scrubbing error", >> +SER, MASK(MCI_UC_SAR|MCACOD_SCRUBMSK, >> MCI_STATUS_UC|MCACOD_SCRUB) > > I think you should check for OVER=0 (as the existing AO cases do). > If there was a patrol scrub reported by CMCI, and then another UC > error, we can't safely treat this as an AO ... because we have no > idea what the second UC error was. OK, I'll add check for OVER=0 in v2. -- Thanks, Xie XiuQi > > -Tony > > . >
Re: [PATCH] x86/mce: add support SRAO reported via CMC check
Hi Tony, On 2017/11/15 2:51, Luck, Tony wrote: > On Tue, Nov 14, 2017 at 01:55:11PM +0800, Xie XiuQi wrote: >> +/* known AO MCACODs reported via CMC: */ >> +MCESEV( >> +AO, "Action optional: memory scrubbing error", >> +SER, MASK(MCI_UC_SAR|MCACOD_SCRUBMSK, >> MCI_STATUS_UC|MCACOD_SCRUB) > > I think you should check for OVER=0 (as the existing AO cases do). > If there was a patrol scrub reported by CMCI, and then another UC > error, we can't safely treat this as an AO ... because we have no > idea what the second UC error was. OK, I'll add check for OVER=0 in v2. -- Thanks, Xie XiuQi > > -Tony > > . >
[PATCH] x86/mce: add support SRAO reported via CMC check
In Intel SDM Volume 3B (253669-063US, July 2017), SRAO could be reported via CMC: In cases when SRAO is signaled via CMCI the error signature is indicated via UC=1, PCC=0, S=0. So we add those known AO MCACODs check in mce_severity(). Signed-off-by: Xie XiuQi <xiexi...@huawei.com> Tested-by: Chen Wei <chenwe...@huawei.com> --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 4ca632a..48f239a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -101,6 +101,16 @@ NOSER, BITCLR(MCI_STATUS_UC) ), + /* known AO MCACODs reported via CMC: */ + MCESEV( + AO, "Action optional: memory scrubbing error", + SER, MASK(MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB) + ), + MCESEV( + AO, "Action optional: last level cache writeback error", + SER, MASK(MCI_UC_SAR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) + ), + /* ignore OVER for UCNA */ MCESEV( UCNA, "Uncorrected no action required", -- 1.8.3.1
[PATCH] x86/mce: add support SRAO reported via CMC check
In Intel SDM Volume 3B (253669-063US, July 2017), SRAO could be reported via CMC: In cases when SRAO is signaled via CMCI the error signature is indicated via UC=1, PCC=0, S=0. So we add those known AO MCACODs check in mce_severity(). Signed-off-by: Xie XiuQi Tested-by: Chen Wei --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 4ca632a..48f239a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -101,6 +101,16 @@ NOSER, BITCLR(MCI_STATUS_UC) ), + /* known AO MCACODs reported via CMC: */ + MCESEV( + AO, "Action optional: memory scrubbing error", + SER, MASK(MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB) + ), + MCESEV( + AO, "Action optional: last level cache writeback error", + SER, MASK(MCI_UC_SAR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) + ), + /* ignore OVER for UCNA */ MCESEV( UCNA, "Uncorrected no action required", -- 1.8.3.1
[PATCH v4 0/4] arm64/ras: support sea error recovery
With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors are consumed. According to the existing process, errors occurred in the kernel, leading to direct panic, if it occurred the user-space, we should just kill process. But there is a class of error, in fact, is not necessary to kill process, you can recover and continue to run the process. Such as the instruction data corrupted, where the memory page might be read-only, which is has not been modified, the disk might have the correct data, so you can directly drop the page, ant reload it when necessary. So this patchset is just try to solve such problem: if the error is consumed in user-space and the error occurs on a clean page, you can directly drop the memory page without killing process. If the corrupted page is clean, just dropped it and return to user-space without side effects. And if corrupted page is dirty, memory_failure() will send SIGBUS with code=BUS_MCEERR_AR. While without this patchset, do_sea() will just send SIGBUS, so the process was killed in the same place. Because memory_failure() may sleep, we can not call it directly in SEA exception context. So we saved faulting physical address associated with a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return from SEA exception context and get into do_notify_resume() before the process running, we could check it and call memory_failure() to do recovery. It's safe, because we are in process context. In some platform, when SEA triggerred, physical address could be reported by memory section or by processor section, so we save address at this two place. --- v4 - v3: - rebase on top of the latest mainline - make ghes_arm_process_error as a weak function - only pick cache error from arm processor section for error recovery - fix s-o-b issue https://lkml.org/lkml/2017/9/7/98 v3 - v2: - fix patch style issue v2 - v1: - wrap arm_proc_error_check and log_arm_hw_error in a single arm_process_error() - fix sea_save_info return value issue - fix link error if this CONFIG_ARM64_ERR_RECOV is not selected - use a notify chain instead of call arch_apei_report_mem_error directly https://lkml.org/lkml/2017/9/1/189 Xie XiuQi (4): ACPI, CPER: add arm error info type definition arm64/ras: support sea error recovery GHES: add a notify chain for process memory section arm64/ras: save error address from memory section for recovery arch/arm64/Kconfig | 11 +++ arch/arm64/include/asm/ras.h | 23 + arch/arm64/include/asm/thread_info.h | 4 +- arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/ras.c | 173 +++ arch/arm64/kernel/signal.c | 7 ++ arch/arm64/mm/fault.c| 27 -- drivers/acpi/apei/ghes.c | 18 +++- include/acpi/ghes.h | 11 +++ include/linux/cper.h | 5 + 10 files changed, 270 insertions(+), 10 deletions(-) create mode 100644 arch/arm64/include/asm/ras.h create mode 100644 arch/arm64/kernel/ras.c -- 1.8.3.1
[PATCH v4 4/4] arm64/ras: save error address from memory section for recovery
In some platform, when SEA triggerred, physical address might be reported by memory section, so we save it for error recovery later. Signed-off-by: Xie XiuQi <xiexi...@huawei.com> --- arch/arm64/kernel/ras.c | 31 +++ 1 file changed, 31 insertions(+) diff --git a/arch/arm64/kernel/ras.c b/arch/arm64/kernel/ras.c index c48503c..ff24181 100644 --- a/arch/arm64/kernel/ras.c +++ b/arch/arm64/kernel/ras.c @@ -140,3 +140,34 @@ void ghes_arm_process_error(struct ghes *ghes, struct cper_sec_proc_arm *err) if (info_saved) set_thread_flag(TIF_SEA_NOTIFY); } + +int ghes_mem_err_callback(struct notifier_block *nb, unsigned long val, void *data) +{ + bool info_saved = false; + struct ghes_mem_err *ghes_mem = (struct ghes_mem_err *)data; + struct cper_sec_mem_err *mem_err = ghes_mem->mem_err; + + if ((ghes_mem->notify_type != ACPI_HEST_NOTIFY_SEA) || + (ghes_mem->severity != CPER_SEV_RECOVERABLE)) + return 0; + + if (mem_err->validation_bits & CPER_MEM_VALID_PA) + info_saved = sea_save_info(mem_err->physical_addr); + + if (info_saved) + set_thread_flag(TIF_SEA_NOTIFY); + + return 0; +} + +static struct notifier_block ghes_mem_err_nb = { + .notifier_call = ghes_mem_err_callback, +}; + +static int arm64_err_recov_init(void) +{ + atomic_notifier_chain_register(_mem_err_chain, _mem_err_nb); + return 0; +} + +late_initcall(arm64_err_recov_init); -- 1.8.3.1
Re: [PATCH v3 1/3] arm64/ras: support sea error recovery
Hi James, I'll send v4 to fix some small issue, for same one who interested with this feature could test it. For the question you mentioned, I will study in depth. Any comments is welcome. On 2017/9/16 2:33, James Morse wrote: > Hi Xie XiuQi, > > On 11/09/17 15:11, Xie XiuQi wrote: >> I first describe the approach of this patchset: >> >> A memory access error on the execution path usually triggers SEA. >> According to the existing process, errors occurred in the kernel, >> leading to direct panic, if it occurred the user-space, we should >> just kill process. >> >> But there is a class of error, in fact, is not necessary to kill >> process, you can recover and continue to run the process. Such as >> the instruction data corrupted, where the memory page might be >> read-only, which is has not been modified, the disk might have the >> correct data, so you can directly drop the page, ant reload it when >> necessary. >> >> So this patchset is just try to solve such problem: if the error is >> consumed in user-space and the error occurs on a clean page, you can >> directly drop the memory page without killing process. >> >> This is implemented in memory_failure, which is generic process. > >> The error reported by SEA should be handled before re-enter the process, >> or we must kill the process to prevent error propagation. >> >> memory_failure_queue() is asynchronous, in which, error info was saved >> at ghes_proc, but handled in kworker. During this period there is a context >> switching, so we can not determine which process would be switch to. So >> memory_failure_queue is not suitable for handling the problem. > > Thanks for this summary. I see the problem you're trying to solve is when > memory_failure() runs, in your scenario its not guaranteed to run before we > return to user space. > > What is the user-visible symptom of this? SIGBUS, code=0 instead of SIGBUS, > code=...MCEERR_A? > > ..in which case I'm looking at this as a race with the memory_failure() bottom > half via schedule_work(). > > How does x86 avoid this same problem? > > >> And memory_failure is not nmi-safe, so it can not be called directly in the >> SEA context. So we just handle this error at SEA exit path, and before >> context >> switching. > > (I need to look more into which locks memory_failure() is taking) > > >> In FFH mode, physical address can only be obtained by parsing the GHES table. >> But we only care about SEA, so the error handling is tied to the type of >> notification. > > I care about all the notification methods. Once the notification has been > passed > to APEI I want them to behave the same so that we don't have subtle bugs > between > the 11 different ways we could get a notification. This code is rarely tested > enough as it is. > >>From the arch code I just want to call out to APEI asking 'is this yours?'. If > so I expect APEI to have done all the work, if not we take the v8.0 behaviour. > > > Here you need APEI and the arch code to spot 'SEA' and treat it differently, > invoking some arm64-specific behaviour for APEI, and some > not-really-arch-specific code under /arch/arm64. There is nothing arm64 > specific > about your arm_process_error(), how come the core APEI code doesn't need to > do this? > > > I think this is caused by the way memory_failure() schedules its work, and > that > is where I'd like to try and fix this, so that its the same for all > notification > methods and all (cough: both) architectures. > > >> The TIF flag is checked on a generic path, but it will only be set when SEA >> occurs. >> And if we use unlikely optimization, it should have little impact on >> performance. > > Yes, the arch code checks _TIF_WORK_MASK in one go so there is no performance > problem for code that hasn't taken the RAS-Error. (and once we've taken a RAS > error performance is out the window!) > > >> And the TIF flag approach was used on x86 platform for years, until commit >> d4812e169d > > ... so x86 doesn't do this ... > >> (x86, mce: Get rid of TIF_MCE_NOTIFY and associated mce tricks)[0]. On >> currently arm64 >> platform, there is no IST interrupt[1] function, so we could not call >> memory_failure >> directly in SEA context. So the way to use TIF notification, is also a good >> choice, >> after all, the same way on x86 platform is verified. > > Thanks, looks like I need to read more of the history of x86's kernel-first > handling... > > > Thanks, > > James > > > . > -- Thanks, Xie XiuQi
[PATCH v4 0/4] arm64/ras: support sea error recovery
With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors are consumed. According to the existing process, errors occurred in the kernel, leading to direct panic, if it occurred the user-space, we should just kill process. But there is a class of error, in fact, is not necessary to kill process, you can recover and continue to run the process. Such as the instruction data corrupted, where the memory page might be read-only, which is has not been modified, the disk might have the correct data, so you can directly drop the page, ant reload it when necessary. So this patchset is just try to solve such problem: if the error is consumed in user-space and the error occurs on a clean page, you can directly drop the memory page without killing process. If the corrupted page is clean, just dropped it and return to user-space without side effects. And if corrupted page is dirty, memory_failure() will send SIGBUS with code=BUS_MCEERR_AR. While without this patchset, do_sea() will just send SIGBUS, so the process was killed in the same place. Because memory_failure() may sleep, we can not call it directly in SEA exception context. So we saved faulting physical address associated with a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return from SEA exception context and get into do_notify_resume() before the process running, we could check it and call memory_failure() to do recovery. It's safe, because we are in process context. In some platform, when SEA triggerred, physical address could be reported by memory section or by processor section, so we save address at this two place. --- v4 - v3: - rebase on top of the latest mainline - make ghes_arm_process_error as a weak function - only pick cache error from arm processor section for error recovery - fix s-o-b issue https://lkml.org/lkml/2017/9/7/98 v3 - v2: - fix patch style issue v2 - v1: - wrap arm_proc_error_check and log_arm_hw_error in a single arm_process_error() - fix sea_save_info return value issue - fix link error if this CONFIG_ARM64_ERR_RECOV is not selected - use a notify chain instead of call arch_apei_report_mem_error directly https://lkml.org/lkml/2017/9/1/189 Xie XiuQi (4): ACPI, CPER: add arm error info type definition arm64/ras: support sea error recovery GHES: add a notify chain for process memory section arm64/ras: save error address from memory section for recovery arch/arm64/Kconfig | 11 +++ arch/arm64/include/asm/ras.h | 23 + arch/arm64/include/asm/thread_info.h | 4 +- arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/ras.c | 173 +++ arch/arm64/kernel/signal.c | 7 ++ arch/arm64/mm/fault.c| 27 -- drivers/acpi/apei/ghes.c | 18 +++- include/acpi/ghes.h | 11 +++ include/linux/cper.h | 5 + 10 files changed, 270 insertions(+), 10 deletions(-) create mode 100644 arch/arm64/include/asm/ras.h create mode 100644 arch/arm64/kernel/ras.c -- 1.8.3.1
[PATCH v4 4/4] arm64/ras: save error address from memory section for recovery
In some platform, when SEA triggerred, physical address might be reported by memory section, so we save it for error recovery later. Signed-off-by: Xie XiuQi --- arch/arm64/kernel/ras.c | 31 +++ 1 file changed, 31 insertions(+) diff --git a/arch/arm64/kernel/ras.c b/arch/arm64/kernel/ras.c index c48503c..ff24181 100644 --- a/arch/arm64/kernel/ras.c +++ b/arch/arm64/kernel/ras.c @@ -140,3 +140,34 @@ void ghes_arm_process_error(struct ghes *ghes, struct cper_sec_proc_arm *err) if (info_saved) set_thread_flag(TIF_SEA_NOTIFY); } + +int ghes_mem_err_callback(struct notifier_block *nb, unsigned long val, void *data) +{ + bool info_saved = false; + struct ghes_mem_err *ghes_mem = (struct ghes_mem_err *)data; + struct cper_sec_mem_err *mem_err = ghes_mem->mem_err; + + if ((ghes_mem->notify_type != ACPI_HEST_NOTIFY_SEA) || + (ghes_mem->severity != CPER_SEV_RECOVERABLE)) + return 0; + + if (mem_err->validation_bits & CPER_MEM_VALID_PA) + info_saved = sea_save_info(mem_err->physical_addr); + + if (info_saved) + set_thread_flag(TIF_SEA_NOTIFY); + + return 0; +} + +static struct notifier_block ghes_mem_err_nb = { + .notifier_call = ghes_mem_err_callback, +}; + +static int arm64_err_recov_init(void) +{ + atomic_notifier_chain_register(_mem_err_chain, _mem_err_nb); + return 0; +} + +late_initcall(arm64_err_recov_init); -- 1.8.3.1
Re: [PATCH v3 1/3] arm64/ras: support sea error recovery
Hi James, I'll send v4 to fix some small issue, for same one who interested with this feature could test it. For the question you mentioned, I will study in depth. Any comments is welcome. On 2017/9/16 2:33, James Morse wrote: > Hi Xie XiuQi, > > On 11/09/17 15:11, Xie XiuQi wrote: >> I first describe the approach of this patchset: >> >> A memory access error on the execution path usually triggers SEA. >> According to the existing process, errors occurred in the kernel, >> leading to direct panic, if it occurred the user-space, we should >> just kill process. >> >> But there is a class of error, in fact, is not necessary to kill >> process, you can recover and continue to run the process. Such as >> the instruction data corrupted, where the memory page might be >> read-only, which is has not been modified, the disk might have the >> correct data, so you can directly drop the page, ant reload it when >> necessary. >> >> So this patchset is just try to solve such problem: if the error is >> consumed in user-space and the error occurs on a clean page, you can >> directly drop the memory page without killing process. >> >> This is implemented in memory_failure, which is generic process. > >> The error reported by SEA should be handled before re-enter the process, >> or we must kill the process to prevent error propagation. >> >> memory_failure_queue() is asynchronous, in which, error info was saved >> at ghes_proc, but handled in kworker. During this period there is a context >> switching, so we can not determine which process would be switch to. So >> memory_failure_queue is not suitable for handling the problem. > > Thanks for this summary. I see the problem you're trying to solve is when > memory_failure() runs, in your scenario its not guaranteed to run before we > return to user space. > > What is the user-visible symptom of this? SIGBUS, code=0 instead of SIGBUS, > code=...MCEERR_A? > > ..in which case I'm looking at this as a race with the memory_failure() bottom > half via schedule_work(). > > How does x86 avoid this same problem? > > >> And memory_failure is not nmi-safe, so it can not be called directly in the >> SEA context. So we just handle this error at SEA exit path, and before >> context >> switching. > > (I need to look more into which locks memory_failure() is taking) > > >> In FFH mode, physical address can only be obtained by parsing the GHES table. >> But we only care about SEA, so the error handling is tied to the type of >> notification. > > I care about all the notification methods. Once the notification has been > passed > to APEI I want them to behave the same so that we don't have subtle bugs > between > the 11 different ways we could get a notification. This code is rarely tested > enough as it is. > >>From the arch code I just want to call out to APEI asking 'is this yours?'. If > so I expect APEI to have done all the work, if not we take the v8.0 behaviour. > > > Here you need APEI and the arch code to spot 'SEA' and treat it differently, > invoking some arm64-specific behaviour for APEI, and some > not-really-arch-specific code under /arch/arm64. There is nothing arm64 > specific > about your arm_process_error(), how come the core APEI code doesn't need to > do this? > > > I think this is caused by the way memory_failure() schedules its work, and > that > is where I'd like to try and fix this, so that its the same for all > notification > methods and all (cough: both) architectures. > > >> The TIF flag is checked on a generic path, but it will only be set when SEA >> occurs. >> And if we use unlikely optimization, it should have little impact on >> performance. > > Yes, the arch code checks _TIF_WORK_MASK in one go so there is no performance > problem for code that hasn't taken the RAS-Error. (and once we've taken a RAS > error performance is out the window!) > > >> And the TIF flag approach was used on x86 platform for years, until commit >> d4812e169d > > ... so x86 doesn't do this ... > >> (x86, mce: Get rid of TIF_MCE_NOTIFY and associated mce tricks)[0]. On >> currently arm64 >> platform, there is no IST interrupt[1] function, so we could not call >> memory_failure >> directly in SEA context. So the way to use TIF notification, is also a good >> choice, >> after all, the same way on x86 platform is verified. > > Thanks, looks like I need to read more of the history of x86's kernel-first > handling... > > > Thanks, > > James > > > . > -- Thanks, Xie XiuQi
[PATCH v4 1/4] ACPI, CPER: add arm error info type definition
Add arm error info type definition according to ACPI6.1 Table 261. ARM Processor Error Information Structure, which is used for error recovery in the following patches. Signed-off-by: Xie XiuQi <xiexi...@huawei.com> --- include/linux/cper.h | 5 + 1 file changed, 5 insertions(+) diff --git a/include/linux/cper.h b/include/linux/cper.h index 723e952..9106e2b 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -275,6 +275,11 @@ enum { #define CPER_ARM_INFO_FLAGS_PROPAGATED BIT(2) #define CPER_ARM_INFO_FLAGS_OVERFLOW BIT(3) +#define CPER_ARM_INFO_TYPE_CACHE 0 +#define CPER_ARM_INFO_TYPE_TLB 1 +#define CPER_ARM_INFO_TYPE_BUS 2 +#define CPER_ARM_INFO_TYPE_UARCH 3 + /* * All tables and structs must be byte-packed to match CPER * specification, since the tables are provided by the system BIOS -- 1.8.3.1
[PATCH v4 2/4] arm64/ras: support sea error recovery
With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors are consumed. According to the existing process, errors occurred in the kernel, leading to direct panic, if it occurred the user-space, we should just kill process. But there is a class of error, in fact, is not necessary to kill process, you can recover and continue to run the process. Such as the instruction data corrupted, where the memory page might be read-only, which is has not been modified, the disk might have the correct data, so you can directly drop the page, ant reload it when necessary. So this patchset is just try to solve such problem: if the error is consumed in user-space and the error occurs on a clean page, you can directly drop the memory page without killing process. If the corrupted page is clean, just dropped it and return to user-space without side effects. And if corrupted page is dirty, memory_failure() will send SIGBUS with code=BUS_MCEERR_AR. While without this patchset, do_sea() will just send SIGBUS, so the process was killed in the same place. Because memory_failure() may sleep, we can not call it directly in SEA exception context. So we saved faulting physical address associated with a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return from SEA exception context and get into do_notify_resume() before the process running, we could check it and call memory_failure() to do recovery. It's safe, because we are in process context. Signed-off-by: Xie XiuQi <xiexi...@huawei.com> Tested-by: Wang Xiongfeng <wangxiongfe...@huawei.com> Cc: James Morse <james.mo...@arm.com> Cc: Borislav Petkov <b...@suse.de> Cc: Julien Thierry <julien.thie...@arm.com> Cc: Hanjun Guo <hanjun@linaro.org> --- arch/arm64/Kconfig | 11 +++ arch/arm64/include/asm/ras.h | 23 ++ arch/arm64/include/asm/thread_info.h | 4 +- arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/ras.c | 142 +++ arch/arm64/kernel/signal.c | 7 ++ arch/arm64/mm/fault.c| 27 +-- drivers/acpi/apei/ghes.c | 8 +- include/acpi/ghes.h | 3 + 9 files changed, 216 insertions(+), 10 deletions(-) create mode 100644 arch/arm64/include/asm/ras.h create mode 100644 arch/arm64/kernel/ras.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 0df64a6..0c2d0a0 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -641,6 +641,17 @@ config HOTPLUG_CPU Say Y here to experiment with turning CPUs off and on. CPUs can be controlled through /sys/devices/system/cpu. +config ARM64_ERR_RECOV + bool "Support arm64 RAS error recovery" + depends on ACPI_APEI_SEA && MEMORY_FAILURE + help + With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors + are consumed. In some cases, if the error address is in a clean page or a + read-only page, there is a chance to recover. Such as error occurs in a + instruction page, we can reread this page from disk instead of killing process. + + Say Y if unsure. + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" diff --git a/arch/arm64/include/asm/ras.h b/arch/arm64/include/asm/ras.h new file mode 100644 index 000..f0f18da --- /dev/null +++ b/arch/arm64/include/asm/ras.h @@ -0,0 +1,23 @@ +/* + * ARM64 SEA error recoery support + * + * Copyright 2017 Huawei Technologies Co., Ltd. + * Author: Xie XiuQi <xiexi...@huawei.com> + * Author: Wang Xiongfeng <wangxiongfe...@huawei.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation; + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _ASM_RAS_H +#define _ASM_RAS_H + +extern void sea_notify_process(void); + +#endif /*_ASM_RAS_H*/ diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index ddded64..b31b308 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -82,6 +82,7 @@ struct thread_info { #define TIF_FOREIGN_FPSTATE3 /* CPU's FP state is not current's */ #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ #define TIF_FSCHECK5 /* Check FS is USER_DS on return */ +#define TIF_SEA_NOTIFY 6 /* notify to do an error recovery */ #define TIF_NOHZ 7 #define TIF_SYSCALL_TRACE 8 #define TIF_SYSCALL_AUDIT 9 @@ -98,6 +99,7 @@ struct thread_info { #define _TIF_N
[PATCH v4 3/4] GHES: add a notify chain for process memory section
Add a notify chain for process memory section, with which other modules might do error recovery. Signed-off-by: Xie XiuQi <xiexi...@huawei.com> --- drivers/acpi/apei/ghes.c | 10 ++ include/acpi/ghes.h | 8 2 files changed, 18 insertions(+) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 2abf673..56b7336 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -108,6 +108,9 @@ static inline bool is_hest_type_generic_v2(struct ghes *ghes) static LIST_HEAD(ghes_hed); static DEFINE_MUTEX(ghes_list_mutex); +ATOMIC_NOTIFIER_HEAD(ghes_mem_err_chain); +EXPORT_SYMBOL(ghes_mem_err_chain); + /* * Because the memory area used to transfer hardware error information * from BIOS to Linux can be determined only in NMI, IRQ or timer @@ -485,6 +488,13 @@ static void ghes_do_proc(struct ghes *ghes, if (guid_equal(sec_type, _SEC_PLATFORM_MEM)) { struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); + struct ghes_mem_err mem; + + mem.notify_type = ghes->generic->notify.type; + mem.severity = gdata->error_severity; + mem.mem_err = mem_err; + + atomic_notifier_call_chain(_mem_err_chain, 0, ); ghes_edac_report_mem_error(ghes, sev, mem_err); diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h index 8bcb8e5..ab51e40 100644 --- a/include/acpi/ghes.h +++ b/include/acpi/ghes.h @@ -123,4 +123,12 @@ static inline void *acpi_hest_get_next(struct acpi_hest_generic_data *gdata) extern void ghes_arm_process_error(struct ghes *ghes, struct cper_sec_proc_arm *err); +struct ghes_mem_err { + int notify_type; + int severity; + struct cper_sec_mem_err *mem_err; +}; + +extern struct atomic_notifier_head ghes_mem_err_chain; + #endif /* GHES_H */ -- 1.8.3.1
[PATCH v4 1/4] ACPI, CPER: add arm error info type definition
Add arm error info type definition according to ACPI6.1 Table 261. ARM Processor Error Information Structure, which is used for error recovery in the following patches. Signed-off-by: Xie XiuQi --- include/linux/cper.h | 5 + 1 file changed, 5 insertions(+) diff --git a/include/linux/cper.h b/include/linux/cper.h index 723e952..9106e2b 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -275,6 +275,11 @@ enum { #define CPER_ARM_INFO_FLAGS_PROPAGATED BIT(2) #define CPER_ARM_INFO_FLAGS_OVERFLOW BIT(3) +#define CPER_ARM_INFO_TYPE_CACHE 0 +#define CPER_ARM_INFO_TYPE_TLB 1 +#define CPER_ARM_INFO_TYPE_BUS 2 +#define CPER_ARM_INFO_TYPE_UARCH 3 + /* * All tables and structs must be byte-packed to match CPER * specification, since the tables are provided by the system BIOS -- 1.8.3.1
[PATCH v4 2/4] arm64/ras: support sea error recovery
With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors are consumed. According to the existing process, errors occurred in the kernel, leading to direct panic, if it occurred the user-space, we should just kill process. But there is a class of error, in fact, is not necessary to kill process, you can recover and continue to run the process. Such as the instruction data corrupted, where the memory page might be read-only, which is has not been modified, the disk might have the correct data, so you can directly drop the page, ant reload it when necessary. So this patchset is just try to solve such problem: if the error is consumed in user-space and the error occurs on a clean page, you can directly drop the memory page without killing process. If the corrupted page is clean, just dropped it and return to user-space without side effects. And if corrupted page is dirty, memory_failure() will send SIGBUS with code=BUS_MCEERR_AR. While without this patchset, do_sea() will just send SIGBUS, so the process was killed in the same place. Because memory_failure() may sleep, we can not call it directly in SEA exception context. So we saved faulting physical address associated with a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return from SEA exception context and get into do_notify_resume() before the process running, we could check it and call memory_failure() to do recovery. It's safe, because we are in process context. Signed-off-by: Xie XiuQi Tested-by: Wang Xiongfeng Cc: James Morse Cc: Borislav Petkov Cc: Julien Thierry Cc: Hanjun Guo --- arch/arm64/Kconfig | 11 +++ arch/arm64/include/asm/ras.h | 23 ++ arch/arm64/include/asm/thread_info.h | 4 +- arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/ras.c | 142 +++ arch/arm64/kernel/signal.c | 7 ++ arch/arm64/mm/fault.c| 27 +-- drivers/acpi/apei/ghes.c | 8 +- include/acpi/ghes.h | 3 + 9 files changed, 216 insertions(+), 10 deletions(-) create mode 100644 arch/arm64/include/asm/ras.h create mode 100644 arch/arm64/kernel/ras.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 0df64a6..0c2d0a0 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -641,6 +641,17 @@ config HOTPLUG_CPU Say Y here to experiment with turning CPUs off and on. CPUs can be controlled through /sys/devices/system/cpu. +config ARM64_ERR_RECOV + bool "Support arm64 RAS error recovery" + depends on ACPI_APEI_SEA && MEMORY_FAILURE + help + With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors + are consumed. In some cases, if the error address is in a clean page or a + read-only page, there is a chance to recover. Such as error occurs in a + instruction page, we can reread this page from disk instead of killing process. + + Say Y if unsure. + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" diff --git a/arch/arm64/include/asm/ras.h b/arch/arm64/include/asm/ras.h new file mode 100644 index 000..f0f18da --- /dev/null +++ b/arch/arm64/include/asm/ras.h @@ -0,0 +1,23 @@ +/* + * ARM64 SEA error recoery support + * + * Copyright 2017 Huawei Technologies Co., Ltd. + * Author: Xie XiuQi + * Author: Wang Xiongfeng + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation; + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _ASM_RAS_H +#define _ASM_RAS_H + +extern void sea_notify_process(void); + +#endif /*_ASM_RAS_H*/ diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index ddded64..b31b308 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -82,6 +82,7 @@ struct thread_info { #define TIF_FOREIGN_FPSTATE3 /* CPU's FP state is not current's */ #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ #define TIF_FSCHECK5 /* Check FS is USER_DS on return */ +#define TIF_SEA_NOTIFY 6 /* notify to do an error recovery */ #define TIF_NOHZ 7 #define TIF_SYSCALL_TRACE 8 #define TIF_SYSCALL_AUDIT 9 @@ -98,6 +99,7 @@ struct thread_info { #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE) #define _TIF_NOHZ (1 << TIF_NOHZ) +#define _TIF_SEA_NOTIFY (1 << TIF_SEA_NOTIFY) #d
[PATCH v4 3/4] GHES: add a notify chain for process memory section
Add a notify chain for process memory section, with which other modules might do error recovery. Signed-off-by: Xie XiuQi --- drivers/acpi/apei/ghes.c | 10 ++ include/acpi/ghes.h | 8 2 files changed, 18 insertions(+) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 2abf673..56b7336 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -108,6 +108,9 @@ static inline bool is_hest_type_generic_v2(struct ghes *ghes) static LIST_HEAD(ghes_hed); static DEFINE_MUTEX(ghes_list_mutex); +ATOMIC_NOTIFIER_HEAD(ghes_mem_err_chain); +EXPORT_SYMBOL(ghes_mem_err_chain); + /* * Because the memory area used to transfer hardware error information * from BIOS to Linux can be determined only in NMI, IRQ or timer @@ -485,6 +488,13 @@ static void ghes_do_proc(struct ghes *ghes, if (guid_equal(sec_type, _SEC_PLATFORM_MEM)) { struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); + struct ghes_mem_err mem; + + mem.notify_type = ghes->generic->notify.type; + mem.severity = gdata->error_severity; + mem.mem_err = mem_err; + + atomic_notifier_call_chain(_mem_err_chain, 0, ); ghes_edac_report_mem_error(ghes, sev, mem_err); diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h index 8bcb8e5..ab51e40 100644 --- a/include/acpi/ghes.h +++ b/include/acpi/ghes.h @@ -123,4 +123,12 @@ static inline void *acpi_hest_get_next(struct acpi_hest_generic_data *gdata) extern void ghes_arm_process_error(struct ghes *ghes, struct cper_sec_proc_arm *err); +struct ghes_mem_err { + int notify_type; + int severity; + struct cper_sec_mem_err *mem_err; +}; + +extern struct atomic_notifier_head ghes_mem_err_chain; + #endif /* GHES_H */ -- 1.8.3.1
Re: [PATCH v3 1/3] arm64/ras: support sea error recovery
Hi James, I try to explain some of following questions. On 2017/9/16 2:33, James Morse wrote: > Hi Xie XiuQi, > > On 11/09/17 15:11, Xie XiuQi wrote: >> I first describe the approach of this patchset: >> >> A memory access error on the execution path usually triggers SEA. >> According to the existing process, errors occurred in the kernel, >> leading to direct panic, if it occurred the user-space, we should >> just kill process. >> >> But there is a class of error, in fact, is not necessary to kill >> process, you can recover and continue to run the process. Such as >> the instruction data corrupted, where the memory page might be >> read-only, which is has not been modified, the disk might have the >> correct data, so you can directly drop the page, ant reload it when >> necessary. >> >> So this patchset is just try to solve such problem: if the error is >> consumed in user-space and the error occurs on a clean page, you can >> directly drop the memory page without killing process. >> >> This is implemented in memory_failure, which is generic process. > >> The error reported by SEA should be handled before re-enter the process, >> or we must kill the process to prevent error propagation. >> >> memory_failure_queue() is asynchronous, in which, error info was saved >> at ghes_proc, but handled in kworker. During this period there is a context >> switching, so we can not determine which process would be switch to. So >> memory_failure_queue is not suitable for handling the problem. > > Thanks for this summary. I see the problem you're trying to solve is when > memory_failure() runs, in your scenario its not guaranteed to run before we > return to user space. > > What is the user-visible symptom of this? SIGBUS, code=0 instead of SIGBUS, > code=...MCEERR_A? If the corrupted page is clean, just dropped it and return to user-space without side effects. And if corrupted page is dirty, memory_failure() will send SIGBUS with code=BUS_MCEERR_AR. > > ..in which case I'm looking at this as a race with the memory_failure() bottom > half via schedule_work(). Do you mean there is a race between memory_failure() from do_notify_resume() and from schedule_work()? Indeed there a race here, thank you for pointing out that. Actually, memory_failure_queue() is not needed in SEA case if we process it in synchronous. I'll try to solve this issue later. > > How does x86 avoid this same problem? > > >> And memory_failure is not nmi-safe, so it can not be called directly in the >> SEA context. So we just handle this error at SEA exit path, and before >> context >> switching. > > (I need to look more into which locks memory_failure() is taking) > > >> In FFH mode, physical address can only be obtained by parsing the GHES table. >> But we only care about SEA, so the error handling is tied to the type of >> notification. > > I care about all the notification methods. Once the notification has been > passed > to APEI I want them to behave the same so that we don't have subtle bugs > between > the 11 different ways we could get a notification. This code is rarely tested > enough as it is. > >>From the arch code I just want to call out to APEI asking 'is this yours?'. If > so I expect APEI to have done all the work, if not we take the v8.0 behaviour. > > > Here you need APEI and the arch code to spot 'SEA' and treat it differently, > invoking some arm64-specific behaviour for APEI, and some > not-really-arch-specific code under /arch/arm64. There is nothing arm64 > specific > about your arm_process_error(), how come the core APEI code doesn't need to > do this? > > > I think this is caused by the way memory_failure() schedules its work, and > that > is where I'd like to try and fix this, so that its the same for all > notification > methods and all (cough: both) architectures. I try to see if there is a better way. > > >> The TIF flag is checked on a generic path, but it will only be set when SEA >> occurs. >> And if we use unlikely optimization, it should have little impact on >> performance. > > Yes, the arch code checks _TIF_WORK_MASK in one go so there is no performance > problem for code that hasn't taken the RAS-Error. (and once we've taken a RAS > error performance is out the window!) > >> And the TIF flag approach was used on x86 platform for years, until commit >> d4812e169d > > ... so x86 doesn't do this ... > >> (x86, mce: Get rid of TIF_MCE_NOTIFY and associated mce tricks)[0]. On >> currently arm64 >> platform, there is no IST interrupt[1] function, so we could not call >> memory_failure >> directly in SEA context. So the way to use TIF notification, is also a good >> choice, >> after all, the same way on x86 platform is verified. > > Thanks, looks like I need to read more of the history of x86's kernel-first > handling... > > > Thanks, > > James > > > . > -- Thanks, Xie XiuQi
Re: [PATCH v3 1/3] arm64/ras: support sea error recovery
Hi James, I try to explain some of following questions. On 2017/9/16 2:33, James Morse wrote: > Hi Xie XiuQi, > > On 11/09/17 15:11, Xie XiuQi wrote: >> I first describe the approach of this patchset: >> >> A memory access error on the execution path usually triggers SEA. >> According to the existing process, errors occurred in the kernel, >> leading to direct panic, if it occurred the user-space, we should >> just kill process. >> >> But there is a class of error, in fact, is not necessary to kill >> process, you can recover and continue to run the process. Such as >> the instruction data corrupted, where the memory page might be >> read-only, which is has not been modified, the disk might have the >> correct data, so you can directly drop the page, ant reload it when >> necessary. >> >> So this patchset is just try to solve such problem: if the error is >> consumed in user-space and the error occurs on a clean page, you can >> directly drop the memory page without killing process. >> >> This is implemented in memory_failure, which is generic process. > >> The error reported by SEA should be handled before re-enter the process, >> or we must kill the process to prevent error propagation. >> >> memory_failure_queue() is asynchronous, in which, error info was saved >> at ghes_proc, but handled in kworker. During this period there is a context >> switching, so we can not determine which process would be switch to. So >> memory_failure_queue is not suitable for handling the problem. > > Thanks for this summary. I see the problem you're trying to solve is when > memory_failure() runs, in your scenario its not guaranteed to run before we > return to user space. > > What is the user-visible symptom of this? SIGBUS, code=0 instead of SIGBUS, > code=...MCEERR_A? If the corrupted page is clean, just dropped it and return to user-space without side effects. And if corrupted page is dirty, memory_failure() will send SIGBUS with code=BUS_MCEERR_AR. > > ..in which case I'm looking at this as a race with the memory_failure() bottom > half via schedule_work(). Do you mean there is a race between memory_failure() from do_notify_resume() and from schedule_work()? Indeed there a race here, thank you for pointing out that. Actually, memory_failure_queue() is not needed in SEA case if we process it in synchronous. I'll try to solve this issue later. > > How does x86 avoid this same problem? > > >> And memory_failure is not nmi-safe, so it can not be called directly in the >> SEA context. So we just handle this error at SEA exit path, and before >> context >> switching. > > (I need to look more into which locks memory_failure() is taking) > > >> In FFH mode, physical address can only be obtained by parsing the GHES table. >> But we only care about SEA, so the error handling is tied to the type of >> notification. > > I care about all the notification methods. Once the notification has been > passed > to APEI I want them to behave the same so that we don't have subtle bugs > between > the 11 different ways we could get a notification. This code is rarely tested > enough as it is. > >>From the arch code I just want to call out to APEI asking 'is this yours?'. If > so I expect APEI to have done all the work, if not we take the v8.0 behaviour. > > > Here you need APEI and the arch code to spot 'SEA' and treat it differently, > invoking some arm64-specific behaviour for APEI, and some > not-really-arch-specific code under /arch/arm64. There is nothing arm64 > specific > about your arm_process_error(), how come the core APEI code doesn't need to > do this? > > > I think this is caused by the way memory_failure() schedules its work, and > that > is where I'd like to try and fix this, so that its the same for all > notification > methods and all (cough: both) architectures. I try to see if there is a better way. > > >> The TIF flag is checked on a generic path, but it will only be set when SEA >> occurs. >> And if we use unlikely optimization, it should have little impact on >> performance. > > Yes, the arch code checks _TIF_WORK_MASK in one go so there is no performance > problem for code that hasn't taken the RAS-Error. (and once we've taken a RAS > error performance is out the window!) > >> And the TIF flag approach was used on x86 platform for years, until commit >> d4812e169d > > ... so x86 doesn't do this ... > >> (x86, mce: Get rid of TIF_MCE_NOTIFY and associated mce tricks)[0]. On >> currently arm64 >> platform, there is no IST interrupt[1] function, so we could not call >> memory_failure >> directly in SEA context. So the way to use TIF notification, is also a good >> choice, >> after all, the same way on x86 platform is verified. > > Thanks, looks like I need to read more of the history of x86's kernel-first > handling... > > > Thanks, > > James > > > . > -- Thanks, Xie XiuQi
Re: [PATCH v3 1/3] arm64/ras: support sea error recovery
Hi James, Thank you very much for your carefully review. I first describe the approach of this patchset: A memory access error on the execution path usually triggers SEA. According to the existing process, errors occurred in the kernel, leading to direct panic, if it occurred the user-space, we should just kill process. But there is a class of error, in fact, is not necessary to kill process, you can recover and continue to run the process. Such as the instruction data corrupted, where the memory page might be read-only, which is has not been modified, the disk might have the correct data, so you can directly drop the page, ant reload it when necessary. So this patchset is just try to solve such problem: if the error is consumed in user-space and the error occurs on a clean page, you can directly drop the memory page without killing process. This is implemented in memory_failure, which is generic process. memory_failure -> hwpoison_user_mappings /* * Propagate the dirty bit from PTEs to struct page first, because we * need this to decide if we should kill or just drop the page. * XXX: the dirty test could be racy: set_page_dirty() may not always * be called inside page lock (it's recommended but not enforced). */ mapping = page_mapping(hpage); if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping && mapping_cap_writeback_dirty(mapping)) { if (page_mkclean(hpage)) { SetPageDirty(hpage); } else { kill = 0; ttu |= TTU_IGNORE_HWPOISON; pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n", pfn); } } The error reported by SEA should be handled before re-enter the process, or we must kill the process to prevent error propagation. memory_failure_queue() is asynchronous, in which, error info was saved at ghes_proc, but handled in kworker. During this period there is a context switching, so we can not determine which process would be switch to. So memory_failure_queue is not suitable for handling the problem. And memory_failure is not nmi-safe, so it can not be called directly in the SEA context. So we just handle this error at SEA exit path, and before context switching. In FFH mode, physical address can only be obtained by parsing the GHES table. But we only care about SEA, so the error handling is tied to the type of notification. The TIF flag is checked on a generic path, but it will only be set when SEA occurs. And if we use unlikely optimization, it should have little impact on performance. And the TIF flag approach was used on x86 platform for years, until commit d4812e169d (x86, mce: Get rid of TIF_MCE_NOTIFY and associated mce tricks)[0]. On currently arm64 platform, there is no IST interrupt[1] function, so we could not call memory_failure directly in SEA context. So the way to use TIF notification, is also a good choice, after all, the same way on x86 platform is verified. Any comment is welcome, thanks. [0] https://patchwork.kernel.org/patch/5571021/ [1] [PATCH v4 0/5] x86: Rework IST interrupts https://lkml.org/lkml/2014/11/21/632 On 2017/9/9 2:15, James Morse wrote: > Hi Xie XiuQi, > > (Sorry a few versions of this went past before I caught up with it) > > On 07/09/17 08:45, Xie XiuQi wrote: >> With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors >> are consumed. In some cases, if the error address is in a clean page or a >> read-only page, there is a chance to recover. Such as error occurs in a >> instruction page, we can reread this page from disk instead of killing >> process. > >> Because memory_failure() may sleep, we can not call it directly in SEA >> exception context. > > This is why we have memory_failure_queue() instead, it ... bother. That > doesn't > look nmi-safe. (I thought this ended with an llist, but clearly I was looking > at > the wrong thing). > > It doesn't look like this is a problem for NOTIFY_SEA as it would only > interrupt > itself on the same CPU if the memory-failure code/data were corrupt. (which is > not a case we can handle). We need to fix this before any of the asynchronous > NMI-like RAS notifications for arm64 get merged. > > (this is one problem, but I don't think its 'the' problem you are trying to > solve with this series). > > >> So we saved faulting physical address associated with >> a process in the ghes handler and set __TIF_SEA_NOTIFY. > > A per-notification type TIF flag looks fishy, surely this would affect all > NMI-like RAS notification methods? > > >> When we return >> from SEA exception context and
Re: [PATCH v3 1/3] arm64/ras: support sea error recovery
Hi James, Thank you very much for your carefully review. I first describe the approach of this patchset: A memory access error on the execution path usually triggers SEA. According to the existing process, errors occurred in the kernel, leading to direct panic, if it occurred the user-space, we should just kill process. But there is a class of error, in fact, is not necessary to kill process, you can recover and continue to run the process. Such as the instruction data corrupted, where the memory page might be read-only, which is has not been modified, the disk might have the correct data, so you can directly drop the page, ant reload it when necessary. So this patchset is just try to solve such problem: if the error is consumed in user-space and the error occurs on a clean page, you can directly drop the memory page without killing process. This is implemented in memory_failure, which is generic process. memory_failure -> hwpoison_user_mappings /* * Propagate the dirty bit from PTEs to struct page first, because we * need this to decide if we should kill or just drop the page. * XXX: the dirty test could be racy: set_page_dirty() may not always * be called inside page lock (it's recommended but not enforced). */ mapping = page_mapping(hpage); if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping && mapping_cap_writeback_dirty(mapping)) { if (page_mkclean(hpage)) { SetPageDirty(hpage); } else { kill = 0; ttu |= TTU_IGNORE_HWPOISON; pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n", pfn); } } The error reported by SEA should be handled before re-enter the process, or we must kill the process to prevent error propagation. memory_failure_queue() is asynchronous, in which, error info was saved at ghes_proc, but handled in kworker. During this period there is a context switching, so we can not determine which process would be switch to. So memory_failure_queue is not suitable for handling the problem. And memory_failure is not nmi-safe, so it can not be called directly in the SEA context. So we just handle this error at SEA exit path, and before context switching. In FFH mode, physical address can only be obtained by parsing the GHES table. But we only care about SEA, so the error handling is tied to the type of notification. The TIF flag is checked on a generic path, but it will only be set when SEA occurs. And if we use unlikely optimization, it should have little impact on performance. And the TIF flag approach was used on x86 platform for years, until commit d4812e169d (x86, mce: Get rid of TIF_MCE_NOTIFY and associated mce tricks)[0]. On currently arm64 platform, there is no IST interrupt[1] function, so we could not call memory_failure directly in SEA context. So the way to use TIF notification, is also a good choice, after all, the same way on x86 platform is verified. Any comment is welcome, thanks. [0] https://patchwork.kernel.org/patch/5571021/ [1] [PATCH v4 0/5] x86: Rework IST interrupts https://lkml.org/lkml/2014/11/21/632 On 2017/9/9 2:15, James Morse wrote: > Hi Xie XiuQi, > > (Sorry a few versions of this went past before I caught up with it) > > On 07/09/17 08:45, Xie XiuQi wrote: >> With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors >> are consumed. In some cases, if the error address is in a clean page or a >> read-only page, there is a chance to recover. Such as error occurs in a >> instruction page, we can reread this page from disk instead of killing >> process. > >> Because memory_failure() may sleep, we can not call it directly in SEA >> exception context. > > This is why we have memory_failure_queue() instead, it ... bother. That > doesn't > look nmi-safe. (I thought this ended with an llist, but clearly I was looking > at > the wrong thing). > > It doesn't look like this is a problem for NOTIFY_SEA as it would only > interrupt > itself on the same CPU if the memory-failure code/data were corrupt. (which is > not a case we can handle). We need to fix this before any of the asynchronous > NMI-like RAS notifications for arm64 get merged. > > (this is one problem, but I don't think its 'the' problem you are trying to > solve with this series). > > >> So we saved faulting physical address associated with >> a process in the ghes handler and set __TIF_SEA_NOTIFY. > > A per-notification type TIF flag looks fishy, surely this would affect all > NMI-like RAS notification methods? > > >> When we return >> from SEA exception context and
[PATCH v3 3/3] arm64/ras: save error address from memory section for recovery
In some platform, when SEA triggerred, physical address might be reported by memory section, so we save it for error recovery later. Signed-off-by: Xie XiuQi <xiexi...@huawei.com> --- arch/arm64/kernel/ras.c | 31 +++ 1 file changed, 31 insertions(+) diff --git a/arch/arm64/kernel/ras.c b/arch/arm64/kernel/ras.c index 797722d..b24b449 100644 --- a/arch/arm64/kernel/ras.c +++ b/arch/arm64/kernel/ras.c @@ -139,3 +139,34 @@ void arm_process_error(struct ghes *ghes, struct cper_sec_proc_arm *err) if (info_saved) set_thread_flag(TIF_SEA_NOTIFY); } + +int ghes_mem_err_callback(struct notifier_block *nb, unsigned long val, void *data) +{ + bool info_saved = false; + struct ghes_mem_err *ghes_mem = (struct ghes_mem_err *)data; + struct cper_sec_mem_err *mem_err = ghes_mem->mem_err; + + if ((ghes_mem->notify_type != ACPI_HEST_NOTIFY_SEA) || + (ghes_mem->severity != CPER_SEV_RECOVERABLE)) + return 0; + + if (mem_err->validation_bits & CPER_MEM_VALID_PA) + info_saved = sea_save_info(mem_err->physical_addr); + + if (info_saved) + set_thread_flag(TIF_SEA_NOTIFY); + + return 0; +} + +static struct notifier_block ghes_mem_err_nb = { + .notifier_call = ghes_mem_err_callback, +}; + +static int arm64_err_recov_init(void) +{ + atomic_notifier_chain_register(_mem_err_chain, _mem_err_nb); + return 0; +} + +late_initcall(arm64_err_recov_init); -- 1.8.3.1
[PATCH v3 3/3] arm64/ras: save error address from memory section for recovery
In some platform, when SEA triggerred, physical address might be reported by memory section, so we save it for error recovery later. Signed-off-by: Xie XiuQi --- arch/arm64/kernel/ras.c | 31 +++ 1 file changed, 31 insertions(+) diff --git a/arch/arm64/kernel/ras.c b/arch/arm64/kernel/ras.c index 797722d..b24b449 100644 --- a/arch/arm64/kernel/ras.c +++ b/arch/arm64/kernel/ras.c @@ -139,3 +139,34 @@ void arm_process_error(struct ghes *ghes, struct cper_sec_proc_arm *err) if (info_saved) set_thread_flag(TIF_SEA_NOTIFY); } + +int ghes_mem_err_callback(struct notifier_block *nb, unsigned long val, void *data) +{ + bool info_saved = false; + struct ghes_mem_err *ghes_mem = (struct ghes_mem_err *)data; + struct cper_sec_mem_err *mem_err = ghes_mem->mem_err; + + if ((ghes_mem->notify_type != ACPI_HEST_NOTIFY_SEA) || + (ghes_mem->severity != CPER_SEV_RECOVERABLE)) + return 0; + + if (mem_err->validation_bits & CPER_MEM_VALID_PA) + info_saved = sea_save_info(mem_err->physical_addr); + + if (info_saved) + set_thread_flag(TIF_SEA_NOTIFY); + + return 0; +} + +static struct notifier_block ghes_mem_err_nb = { + .notifier_call = ghes_mem_err_callback, +}; + +static int arm64_err_recov_init(void) +{ + atomic_notifier_chain_register(_mem_err_chain, _mem_err_nb); + return 0; +} + +late_initcall(arm64_err_recov_init); -- 1.8.3.1
[PATCH v3 1/3] arm64/ras: support sea error recovery
With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors are consumed. In some cases, if the error address is in a clean page or a read-only page, there is a chance to recover. Such as error occurs in a instruction page, we can reread this page from disk instead of killing process. Because memory_failure() may sleep, we can not call it directly in SEA exception context. So we saved faulting physical address associated with a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return from SEA exception context and get into do_notify_resume() before the process running, we could check it and call memory_failure() to do recovery. It's safe, because we are in process context. Signed-off-by: Xie XiuQi <xiexi...@huawei.com> Signed-off-by: Wang Xiongfeng <wangxiongfe...@huawei.com> --- arch/arm64/Kconfig | 11 +++ arch/arm64/include/asm/ras.h | 36 + arch/arm64/include/asm/thread_info.h | 4 +- arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/ras.c | 141 +++ arch/arm64/kernel/signal.c | 8 ++ arch/arm64/mm/fault.c| 27 +-- drivers/acpi/apei/ghes.c | 4 +- 8 files changed, 221 insertions(+), 11 deletions(-) create mode 100644 arch/arm64/include/asm/ras.h create mode 100644 arch/arm64/kernel/ras.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index dfd9086..7d44589 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -640,6 +640,17 @@ config HOTPLUG_CPU Say Y here to experiment with turning CPUs off and on. CPUs can be controlled through /sys/devices/system/cpu. +config ARM64_ERR_RECOV + bool "Support arm64 RAS error recovery" + depends on ACPI_APEI_SEA && MEMORY_FAILURE + help + With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors + are consumed. In some cases, if the error address is in a clean page or a + read-only page, there is a chance to recover. Such as error occurs in a + instruction page, we can reread this page from disk instead of killing process. + + Say Y if unsure. + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" diff --git a/arch/arm64/include/asm/ras.h b/arch/arm64/include/asm/ras.h new file mode 100644 index 000..e174f95 --- /dev/null +++ b/arch/arm64/include/asm/ras.h @@ -0,0 +1,36 @@ +/* + * ARM64 SEA error recoery support + * + * Copyright 2017 Huawei Technologies Co., Ltd. + * Author: Xie XiuQi <xiexi...@huawei.com> + * Author: Wang Xiongfeng <wangxiongfe...@huawei.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation; + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _ASM_RAS_H +#define _ASM_RAS_H + +#include +#include +#include + +extern void sea_notify_process(void); + +#ifdef CONFIG_ARM64_ERR_RECOV +extern void arm_process_error(struct ghes *ghes, struct cper_sec_proc_arm *err); +#else +static inline void arm_process_error(struct ghes *ghes, struct cper_sec_proc_arm *err) +{ + log_arm_hw_error(err); +} +#endif /* CONFIG_ARM64_ERR_RECOV */ + +#endif /*_ASM_RAS_H*/ diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 46c3b93..4b10131 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -86,6 +86,7 @@ struct thread_info { #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_FOREIGN_FPSTATE3 /* CPU's FP state is not current's */ #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ +#define TIF_SEA_NOTIFY 5 /* notify to do an error recovery */ #define TIF_NOHZ 7 #define TIF_SYSCALL_TRACE 8 #define TIF_SYSCALL_AUDIT 9 @@ -102,6 +103,7 @@ struct thread_info { #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE) #define _TIF_NOHZ (1 << TIF_NOHZ) +#define _TIF_SEA_NOTIFY (1 << TIF_SEA_NOTIFY) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SYSCALL_TRACEPOINT(1 << TIF_SYSCALL_TRACEPOINT) @@ -111,7 +113,7 @@ struct thread_info { #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ -
[PATCH v3 1/3] arm64/ras: support sea error recovery
With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors are consumed. In some cases, if the error address is in a clean page or a read-only page, there is a chance to recover. Such as error occurs in a instruction page, we can reread this page from disk instead of killing process. Because memory_failure() may sleep, we can not call it directly in SEA exception context. So we saved faulting physical address associated with a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return from SEA exception context and get into do_notify_resume() before the process running, we could check it and call memory_failure() to do recovery. It's safe, because we are in process context. Signed-off-by: Xie XiuQi Signed-off-by: Wang Xiongfeng --- arch/arm64/Kconfig | 11 +++ arch/arm64/include/asm/ras.h | 36 + arch/arm64/include/asm/thread_info.h | 4 +- arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/ras.c | 141 +++ arch/arm64/kernel/signal.c | 8 ++ arch/arm64/mm/fault.c| 27 +-- drivers/acpi/apei/ghes.c | 4 +- 8 files changed, 221 insertions(+), 11 deletions(-) create mode 100644 arch/arm64/include/asm/ras.h create mode 100644 arch/arm64/kernel/ras.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index dfd9086..7d44589 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -640,6 +640,17 @@ config HOTPLUG_CPU Say Y here to experiment with turning CPUs off and on. CPUs can be controlled through /sys/devices/system/cpu. +config ARM64_ERR_RECOV + bool "Support arm64 RAS error recovery" + depends on ACPI_APEI_SEA && MEMORY_FAILURE + help + With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors + are consumed. In some cases, if the error address is in a clean page or a + read-only page, there is a chance to recover. Such as error occurs in a + instruction page, we can reread this page from disk instead of killing process. + + Say Y if unsure. + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" diff --git a/arch/arm64/include/asm/ras.h b/arch/arm64/include/asm/ras.h new file mode 100644 index 000..e174f95 --- /dev/null +++ b/arch/arm64/include/asm/ras.h @@ -0,0 +1,36 @@ +/* + * ARM64 SEA error recoery support + * + * Copyright 2017 Huawei Technologies Co., Ltd. + * Author: Xie XiuQi + * Author: Wang Xiongfeng + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation; + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _ASM_RAS_H +#define _ASM_RAS_H + +#include +#include +#include + +extern void sea_notify_process(void); + +#ifdef CONFIG_ARM64_ERR_RECOV +extern void arm_process_error(struct ghes *ghes, struct cper_sec_proc_arm *err); +#else +static inline void arm_process_error(struct ghes *ghes, struct cper_sec_proc_arm *err) +{ + log_arm_hw_error(err); +} +#endif /* CONFIG_ARM64_ERR_RECOV */ + +#endif /*_ASM_RAS_H*/ diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 46c3b93..4b10131 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -86,6 +86,7 @@ struct thread_info { #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_FOREIGN_FPSTATE3 /* CPU's FP state is not current's */ #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ +#define TIF_SEA_NOTIFY 5 /* notify to do an error recovery */ #define TIF_NOHZ 7 #define TIF_SYSCALL_TRACE 8 #define TIF_SYSCALL_AUDIT 9 @@ -102,6 +103,7 @@ struct thread_info { #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE) #define _TIF_NOHZ (1 << TIF_NOHZ) +#define _TIF_SEA_NOTIFY (1 << TIF_SEA_NOTIFY) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SYSCALL_TRACEPOINT(1 << TIF_SYSCALL_TRACEPOINT) @@ -111,7 +113,7 @@ struct thread_info { #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ -_TIF_UPROBE) +_TIF_UPROBE|_TIF_SEA_NOTIFY) #
[PATCH v3 2/3] GHES: add a notify chain for process memory section
Add a notify chain for process memory section, with which other modules might do error recovery. Signed-off-by: Xie XiuQi <xiexi...@huawei.com> --- drivers/acpi/apei/ghes.c | 10 ++ include/acpi/ghes.h | 8 2 files changed, 18 insertions(+) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 502335c..5b310b7 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -108,6 +108,9 @@ static inline bool is_hest_type_generic_v2(struct ghes *ghes) static LIST_HEAD(ghes_hed); static DEFINE_MUTEX(ghes_list_mutex); +ATOMIC_NOTIFIER_HEAD(ghes_mem_err_chain); +EXPORT_SYMBOL(ghes_mem_err_chain); + /* * Because the memory area used to transfer hardware error information * from BIOS to Linux can be determined only in NMI, IRQ or timer @@ -479,6 +482,13 @@ static void ghes_do_proc(struct ghes *ghes, if (guid_equal(sec_type, _SEC_PLATFORM_MEM)) { struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); + struct ghes_mem_err mem; + + mem.notify_type = ghes->generic->notify.type; + mem.severity = gdata->error_severity; + mem.mem_err = mem_err; + + atomic_notifier_call_chain(_mem_err_chain, 0, ); ghes_edac_report_mem_error(ghes, sev, mem_err); diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h index 9f26e01..df3f6d9 100644 --- a/include/acpi/ghes.h +++ b/include/acpi/ghes.h @@ -115,4 +115,12 @@ static inline void *acpi_hest_get_next(struct acpi_hest_generic_data *gdata) int ghes_notify_sea(void); +struct ghes_mem_err { + int notify_type; + int severity; + struct cper_sec_mem_err *mem_err; +}; + +extern struct atomic_notifier_head ghes_mem_err_chain; + #endif /* GHES_H */ -- 1.8.3.1
[PATCH v3 0/3] arm64/ras: support sea error recovery
With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors are consumed. In some cases, if the error address is in a clean page or a read-only page, there is a chance to recover. Such as error occurs in a instruction page, we can reread this page from disk instead of killing process. Because memory_failure() may sleep, we can not call it directly in SEA exception context. So we saved faulting physical address associated with a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return from SEA exception context and get into do_notify_resume() before the process running, we could check it and call memory_failure() to do recovery. It's safe, because we are in process context. In some platform, when SEA triggerred, physical address could be reported by memory section or by processor section, so we save address at this two place. --- v3 - v2: - fix patch style issue v2 - v1: - wrap arm_proc_error_check and log_arm_hw_error in a single arm_process_error() - fix sea_save_info return value issue - fix link error if this CONFIG_ARM64_ERR_RECOV is not selected - use a notify chain instead of call arch_apei_report_mem_error directly https://lkml.org/lkml/2017/9/1/189 Xie XiuQi (3): arm64/ras: support sea error recovery GHES: add a notify chain for process memory section arm64/ras: save error address from memory section for recovery arch/arm64/Kconfig | 11 +++ arch/arm64/include/asm/ras.h | 36 arch/arm64/include/asm/thread_info.h | 4 +- arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/ras.c | 172 +++ arch/arm64/kernel/signal.c | 8 ++ arch/arm64/mm/fault.c| 27 -- drivers/acpi/apei/ghes.c | 14 ++- include/acpi/ghes.h | 8 ++ 9 files changed, 270 insertions(+), 11 deletions(-) create mode 100644 arch/arm64/include/asm/ras.h create mode 100644 arch/arm64/kernel/ras.c -- 1.8.3.1
[PATCH v3 2/3] GHES: add a notify chain for process memory section
Add a notify chain for process memory section, with which other modules might do error recovery. Signed-off-by: Xie XiuQi --- drivers/acpi/apei/ghes.c | 10 ++ include/acpi/ghes.h | 8 2 files changed, 18 insertions(+) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 502335c..5b310b7 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -108,6 +108,9 @@ static inline bool is_hest_type_generic_v2(struct ghes *ghes) static LIST_HEAD(ghes_hed); static DEFINE_MUTEX(ghes_list_mutex); +ATOMIC_NOTIFIER_HEAD(ghes_mem_err_chain); +EXPORT_SYMBOL(ghes_mem_err_chain); + /* * Because the memory area used to transfer hardware error information * from BIOS to Linux can be determined only in NMI, IRQ or timer @@ -479,6 +482,13 @@ static void ghes_do_proc(struct ghes *ghes, if (guid_equal(sec_type, _SEC_PLATFORM_MEM)) { struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); + struct ghes_mem_err mem; + + mem.notify_type = ghes->generic->notify.type; + mem.severity = gdata->error_severity; + mem.mem_err = mem_err; + + atomic_notifier_call_chain(_mem_err_chain, 0, ); ghes_edac_report_mem_error(ghes, sev, mem_err); diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h index 9f26e01..df3f6d9 100644 --- a/include/acpi/ghes.h +++ b/include/acpi/ghes.h @@ -115,4 +115,12 @@ static inline void *acpi_hest_get_next(struct acpi_hest_generic_data *gdata) int ghes_notify_sea(void); +struct ghes_mem_err { + int notify_type; + int severity; + struct cper_sec_mem_err *mem_err; +}; + +extern struct atomic_notifier_head ghes_mem_err_chain; + #endif /* GHES_H */ -- 1.8.3.1
[PATCH v3 0/3] arm64/ras: support sea error recovery
With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors are consumed. In some cases, if the error address is in a clean page or a read-only page, there is a chance to recover. Such as error occurs in a instruction page, we can reread this page from disk instead of killing process. Because memory_failure() may sleep, we can not call it directly in SEA exception context. So we saved faulting physical address associated with a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return from SEA exception context and get into do_notify_resume() before the process running, we could check it and call memory_failure() to do recovery. It's safe, because we are in process context. In some platform, when SEA triggerred, physical address could be reported by memory section or by processor section, so we save address at this two place. --- v3 - v2: - fix patch style issue v2 - v1: - wrap arm_proc_error_check and log_arm_hw_error in a single arm_process_error() - fix sea_save_info return value issue - fix link error if this CONFIG_ARM64_ERR_RECOV is not selected - use a notify chain instead of call arch_apei_report_mem_error directly https://lkml.org/lkml/2017/9/1/189 Xie XiuQi (3): arm64/ras: support sea error recovery GHES: add a notify chain for process memory section arm64/ras: save error address from memory section for recovery arch/arm64/Kconfig | 11 +++ arch/arm64/include/asm/ras.h | 36 arch/arm64/include/asm/thread_info.h | 4 +- arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/ras.c | 172 +++ arch/arm64/kernel/signal.c | 8 ++ arch/arm64/mm/fault.c| 27 -- drivers/acpi/apei/ghes.c | 14 ++- include/acpi/ghes.h | 8 ++ 9 files changed, 270 insertions(+), 11 deletions(-) create mode 100644 arch/arm64/include/asm/ras.h create mode 100644 arch/arm64/kernel/ras.c -- 1.8.3.1
Re: [PATCH v2 1/3] arm64/ras: support sea error recovery
Hi Borislav, On 2017/9/6 18:12, Borislav Petkov wrote: > On Tue, Sep 05, 2017 at 07:06:04PM +0800, Xie XiuQi wrote: >> With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors >> are consumed. In some cases, if the error address is in a clean page or a >> read-only page, there is a chance to recover. Such as error occurs in a >> instruction page, we can reread this page from disk instead of killing >> process. >> >> Because memory_failure() may sleep, we can not call it directly in SEA >> exception >> context. So we saved faulting physical address associated with a process in >> the >> ghes handler and set __TIF_SEA_NOTIFY. When we return from SEA exception >> context >> and get into do_notify_resume() before the process running, we could check it >> and call memory_failure() to do recovery. It's safe, because we are in >> process >> context. >> >> Signed-off-by: Xie XiuQi <xiexi...@huawei.com> >> Signed-off-by: Wang Xiongfeng <wangxiongfe...@huawei.com> >> --- >> arch/arm64/Kconfig | 11 +++ >> arch/arm64/include/asm/ras.h | 36 + >> arch/arm64/include/asm/thread_info.h | 4 +- >> arch/arm64/kernel/Makefile | 1 + >> arch/arm64/kernel/ras.c | 143 >> +++ >> arch/arm64/kernel/signal.c | 8 ++ >> arch/arm64/mm/fault.c| 27 +-- >> drivers/acpi/apei/ghes.c | 4 +- >> 8 files changed, 223 insertions(+), 11 deletions(-) >> create mode 100644 arch/arm64/include/asm/ras.h >> create mode 100644 arch/arm64/kernel/ras.c > > Please integrate scripts/checkpatch.pl into your patch creation workflow > and run all patches through it before submitting: Sorry for my mistake. I'll fix it, thanks. > > ERROR: code indent should use tabs where possible > #200: FILE: arch/arm64/kernel/ras.c:42: > +atomic_tinuse;$ > > WARNING: please, no spaces at the start of a line > #200: FILE: arch/arm64/kernel/ras.c:42: > +atomic_tinuse;$ > > ERROR: code indent should use tabs where possible > #201: FILE: arch/arm64/kernel/ras.c:43: > +struct task_struct *t;$ > > WARNING: please, no spaces at the start of a line > #201: FILE: arch/arm64/kernel/ras.c:43: > +struct task_struct *t;$ > > ERROR: code indent should use tabs where possible > #202: FILE: arch/arm64/kernel/ras.c:44: > +__u64 paddr;$ > > WARNING: please, no spaces at the start of a line > #202: FILE: arch/arm64/kernel/ras.c:44: > +__u64 paddr;$ > > ERROR: code indent should use tabs where possible > #207: FILE: arch/arm64/kernel/ras.c:49: > +struct sea_info *si;$ > > WARNING: please, no spaces at the start of a line > #207: FILE: arch/arm64/kernel/ras.c:49: > +struct sea_info *si;$ > > ERROR: code indent should use tabs where possible > #209: FILE: arch/arm64/kernel/ras.c:51: > +for (si = sea_info; si < _info[SEA_INFO_MAX]; si++) {$ > > WARNING: please, no spaces at the start of a line > #209: FILE: arch/arm64/kernel/ras.c:51: > +for (si = sea_info; si < _info[SEA_INFO_MAX]; si++) {$ > > ERROR: code indent should use tabs where possible > #210: FILE: arch/arm64/kernel/ras.c:52: > +if (atomic_cmpxchg(>inuse, 0, 1) == 0) {$ > > WARNING: please, no spaces at the start of a line > #210: FILE: arch/arm64/kernel/ras.c:52: > +if (atomic_cmpxchg(>inuse, 0, 1) == 0) {$ > > ERROR: code indent should use tabs where possible > #211: FILE: arch/arm64/kernel/ras.c:53: > +si->t = current;$ > > WARNING: please, no spaces at the start of a line > #211: FILE: arch/arm64/kernel/ras.c:53: > +si->t = current;$ > > ERROR: code indent should use tabs where possible > #212: FILE: arch/arm64/kernel/ras.c:54: > +si->paddr = addr;$ > > WARNING: please, no spaces at the start of a line > #212: FILE: arch/arm64/kernel/ras.c:54: > +si->paddr = addr;$ > > ERROR: code indent should use tabs where possible > #213: FILE: arch/arm64/kernel/ras.c:55: > +return true;$ > > WARNING: please, no spaces at the start of a line > #213: FILE: arch/arm64/kernel/ras.c:55: > +return true;$ > > ERROR: code indent should use tabs where possible > #214: FILE: arch/arm64/kernel/ras.c:56: > +}$ > > WARNING: please, no spaces at the st
Re: [PATCH v2 1/3] arm64/ras: support sea error recovery
Hi Borislav, On 2017/9/6 18:12, Borislav Petkov wrote: > On Tue, Sep 05, 2017 at 07:06:04PM +0800, Xie XiuQi wrote: >> With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors >> are consumed. In some cases, if the error address is in a clean page or a >> read-only page, there is a chance to recover. Such as error occurs in a >> instruction page, we can reread this page from disk instead of killing >> process. >> >> Because memory_failure() may sleep, we can not call it directly in SEA >> exception >> context. So we saved faulting physical address associated with a process in >> the >> ghes handler and set __TIF_SEA_NOTIFY. When we return from SEA exception >> context >> and get into do_notify_resume() before the process running, we could check it >> and call memory_failure() to do recovery. It's safe, because we are in >> process >> context. >> >> Signed-off-by: Xie XiuQi >> Signed-off-by: Wang Xiongfeng >> --- >> arch/arm64/Kconfig | 11 +++ >> arch/arm64/include/asm/ras.h | 36 + >> arch/arm64/include/asm/thread_info.h | 4 +- >> arch/arm64/kernel/Makefile | 1 + >> arch/arm64/kernel/ras.c | 143 >> +++ >> arch/arm64/kernel/signal.c | 8 ++ >> arch/arm64/mm/fault.c| 27 +-- >> drivers/acpi/apei/ghes.c | 4 +- >> 8 files changed, 223 insertions(+), 11 deletions(-) >> create mode 100644 arch/arm64/include/asm/ras.h >> create mode 100644 arch/arm64/kernel/ras.c > > Please integrate scripts/checkpatch.pl into your patch creation workflow > and run all patches through it before submitting: Sorry for my mistake. I'll fix it, thanks. > > ERROR: code indent should use tabs where possible > #200: FILE: arch/arm64/kernel/ras.c:42: > +atomic_tinuse;$ > > WARNING: please, no spaces at the start of a line > #200: FILE: arch/arm64/kernel/ras.c:42: > +atomic_tinuse;$ > > ERROR: code indent should use tabs where possible > #201: FILE: arch/arm64/kernel/ras.c:43: > +struct task_struct *t;$ > > WARNING: please, no spaces at the start of a line > #201: FILE: arch/arm64/kernel/ras.c:43: > +struct task_struct *t;$ > > ERROR: code indent should use tabs where possible > #202: FILE: arch/arm64/kernel/ras.c:44: > +__u64 paddr;$ > > WARNING: please, no spaces at the start of a line > #202: FILE: arch/arm64/kernel/ras.c:44: > +__u64 paddr;$ > > ERROR: code indent should use tabs where possible > #207: FILE: arch/arm64/kernel/ras.c:49: > +struct sea_info *si;$ > > WARNING: please, no spaces at the start of a line > #207: FILE: arch/arm64/kernel/ras.c:49: > +struct sea_info *si;$ > > ERROR: code indent should use tabs where possible > #209: FILE: arch/arm64/kernel/ras.c:51: > +for (si = sea_info; si < _info[SEA_INFO_MAX]; si++) {$ > > WARNING: please, no spaces at the start of a line > #209: FILE: arch/arm64/kernel/ras.c:51: > +for (si = sea_info; si < _info[SEA_INFO_MAX]; si++) {$ > > ERROR: code indent should use tabs where possible > #210: FILE: arch/arm64/kernel/ras.c:52: > +if (atomic_cmpxchg(>inuse, 0, 1) == 0) {$ > > WARNING: please, no spaces at the start of a line > #210: FILE: arch/arm64/kernel/ras.c:52: > +if (atomic_cmpxchg(>inuse, 0, 1) == 0) {$ > > ERROR: code indent should use tabs where possible > #211: FILE: arch/arm64/kernel/ras.c:53: > +si->t = current;$ > > WARNING: please, no spaces at the start of a line > #211: FILE: arch/arm64/kernel/ras.c:53: > +si->t = current;$ > > ERROR: code indent should use tabs where possible > #212: FILE: arch/arm64/kernel/ras.c:54: > +si->paddr = addr;$ > > WARNING: please, no spaces at the start of a line > #212: FILE: arch/arm64/kernel/ras.c:54: > +si->paddr = addr;$ > > ERROR: code indent should use tabs where possible > #213: FILE: arch/arm64/kernel/ras.c:55: > +return true;$ > > WARNING: please, no spaces at the start of a line > #213: FILE: arch/arm64/kernel/ras.c:55: > +return true;$ > > ERROR: code indent should use tabs where possible > #214: FILE: arch/arm64/kernel/ras.c:56: > +}$ > > WARNING: please, no spaces at the start of a line > #214: FILE: arch/arm64/kernel/ras.c:56: >
[PATCH v2 1/3] arm64/ras: support sea error recovery
With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors are consumed. In some cases, if the error address is in a clean page or a read-only page, there is a chance to recover. Such as error occurs in a instruction page, we can reread this page from disk instead of killing process. Because memory_failure() may sleep, we can not call it directly in SEA exception context. So we saved faulting physical address associated with a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return from SEA exception context and get into do_notify_resume() before the process running, we could check it and call memory_failure() to do recovery. It's safe, because we are in process context. Signed-off-by: Xie XiuQi <xiexi...@huawei.com> Signed-off-by: Wang Xiongfeng <wangxiongfe...@huawei.com> --- arch/arm64/Kconfig | 11 +++ arch/arm64/include/asm/ras.h | 36 + arch/arm64/include/asm/thread_info.h | 4 +- arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/ras.c | 143 +++ arch/arm64/kernel/signal.c | 8 ++ arch/arm64/mm/fault.c| 27 +-- drivers/acpi/apei/ghes.c | 4 +- 8 files changed, 223 insertions(+), 11 deletions(-) create mode 100644 arch/arm64/include/asm/ras.h create mode 100644 arch/arm64/kernel/ras.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index dfd9086..7d44589 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -640,6 +640,17 @@ config HOTPLUG_CPU Say Y here to experiment with turning CPUs off and on. CPUs can be controlled through /sys/devices/system/cpu. +config ARM64_ERR_RECOV + bool "Support arm64 RAS error recovery" + depends on ACPI_APEI_SEA && MEMORY_FAILURE + help + With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors + are consumed. In some cases, if the error address is in a clean page or a + read-only page, there is a chance to recover. Such as error occurs in a + instruction page, we can reread this page from disk instead of killing process. + + Say Y if unsure. + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" diff --git a/arch/arm64/include/asm/ras.h b/arch/arm64/include/asm/ras.h new file mode 100644 index 000..e174f95 --- /dev/null +++ b/arch/arm64/include/asm/ras.h @@ -0,0 +1,36 @@ +/* + * ARM64 SEA error recoery support + * + * Copyright 2017 Huawei Technologies Co., Ltd. + * Author: Xie XiuQi <xiexi...@huawei.com> + * Author: Wang Xiongfeng <wangxiongfe...@huawei.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation; + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _ASM_RAS_H +#define _ASM_RAS_H + +#include +#include +#include + +extern void sea_notify_process(void); + +#ifdef CONFIG_ARM64_ERR_RECOV +extern void arm_process_error(struct ghes *ghes, struct cper_sec_proc_arm *err); +#else +static inline void arm_process_error(struct ghes *ghes, struct cper_sec_proc_arm *err) +{ + log_arm_hw_error(err); +} +#endif /* CONFIG_ARM64_ERR_RECOV */ + +#endif /*_ASM_RAS_H*/ diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 46c3b93..4b10131 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -86,6 +86,7 @@ struct thread_info { #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_FOREIGN_FPSTATE3 /* CPU's FP state is not current's */ #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ +#define TIF_SEA_NOTIFY 5 /* notify to do an error recovery */ #define TIF_NOHZ 7 #define TIF_SYSCALL_TRACE 8 #define TIF_SYSCALL_AUDIT 9 @@ -102,6 +103,7 @@ struct thread_info { #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE) #define _TIF_NOHZ (1 << TIF_NOHZ) +#define _TIF_SEA_NOTIFY (1 << TIF_SEA_NOTIFY) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SYSCALL_TRACEPOINT(1 << TIF_SYSCALL_TRACEPOINT) @@ -111,7 +113,7 @@ struct thread_info { #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ -
[PATCH v2 1/3] arm64/ras: support sea error recovery
With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors are consumed. In some cases, if the error address is in a clean page or a read-only page, there is a chance to recover. Such as error occurs in a instruction page, we can reread this page from disk instead of killing process. Because memory_failure() may sleep, we can not call it directly in SEA exception context. So we saved faulting physical address associated with a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return from SEA exception context and get into do_notify_resume() before the process running, we could check it and call memory_failure() to do recovery. It's safe, because we are in process context. Signed-off-by: Xie XiuQi Signed-off-by: Wang Xiongfeng --- arch/arm64/Kconfig | 11 +++ arch/arm64/include/asm/ras.h | 36 + arch/arm64/include/asm/thread_info.h | 4 +- arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/ras.c | 143 +++ arch/arm64/kernel/signal.c | 8 ++ arch/arm64/mm/fault.c| 27 +-- drivers/acpi/apei/ghes.c | 4 +- 8 files changed, 223 insertions(+), 11 deletions(-) create mode 100644 arch/arm64/include/asm/ras.h create mode 100644 arch/arm64/kernel/ras.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index dfd9086..7d44589 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -640,6 +640,17 @@ config HOTPLUG_CPU Say Y here to experiment with turning CPUs off and on. CPUs can be controlled through /sys/devices/system/cpu. +config ARM64_ERR_RECOV + bool "Support arm64 RAS error recovery" + depends on ACPI_APEI_SEA && MEMORY_FAILURE + help + With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors + are consumed. In some cases, if the error address is in a clean page or a + read-only page, there is a chance to recover. Such as error occurs in a + instruction page, we can reread this page from disk instead of killing process. + + Say Y if unsure. + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" diff --git a/arch/arm64/include/asm/ras.h b/arch/arm64/include/asm/ras.h new file mode 100644 index 000..e174f95 --- /dev/null +++ b/arch/arm64/include/asm/ras.h @@ -0,0 +1,36 @@ +/* + * ARM64 SEA error recoery support + * + * Copyright 2017 Huawei Technologies Co., Ltd. + * Author: Xie XiuQi + * Author: Wang Xiongfeng + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation; + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _ASM_RAS_H +#define _ASM_RAS_H + +#include +#include +#include + +extern void sea_notify_process(void); + +#ifdef CONFIG_ARM64_ERR_RECOV +extern void arm_process_error(struct ghes *ghes, struct cper_sec_proc_arm *err); +#else +static inline void arm_process_error(struct ghes *ghes, struct cper_sec_proc_arm *err) +{ + log_arm_hw_error(err); +} +#endif /* CONFIG_ARM64_ERR_RECOV */ + +#endif /*_ASM_RAS_H*/ diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 46c3b93..4b10131 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -86,6 +86,7 @@ struct thread_info { #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_FOREIGN_FPSTATE3 /* CPU's FP state is not current's */ #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ +#define TIF_SEA_NOTIFY 5 /* notify to do an error recovery */ #define TIF_NOHZ 7 #define TIF_SYSCALL_TRACE 8 #define TIF_SYSCALL_AUDIT 9 @@ -102,6 +103,7 @@ struct thread_info { #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE) #define _TIF_NOHZ (1 << TIF_NOHZ) +#define _TIF_SEA_NOTIFY (1 << TIF_SEA_NOTIFY) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SYSCALL_TRACEPOINT(1 << TIF_SYSCALL_TRACEPOINT) @@ -111,7 +113,7 @@ struct thread_info { #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ -_TIF_UPROBE) +_TIF_UPROBE|_TIF_SEA_NOTIFY) #
[PATCH v2 2/3] GHES: add a notify chain for process memory section
Add a notify chain for process memory section, with which other modules might do error recovery. Signed-off-by: Xie XiuQi <xiexi...@huawei.com> Tested-by: Wang Xiongfeng <wangxiongfe...@huawei.com> --- drivers/acpi/apei/ghes.c | 10 ++ include/acpi/ghes.h | 8 2 files changed, 18 insertions(+) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 502335c..5b310b7 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -108,6 +108,9 @@ static inline bool is_hest_type_generic_v2(struct ghes *ghes) static LIST_HEAD(ghes_hed); static DEFINE_MUTEX(ghes_list_mutex); +ATOMIC_NOTIFIER_HEAD(ghes_mem_err_chain); +EXPORT_SYMBOL(ghes_mem_err_chain); + /* * Because the memory area used to transfer hardware error information * from BIOS to Linux can be determined only in NMI, IRQ or timer @@ -479,6 +482,13 @@ static void ghes_do_proc(struct ghes *ghes, if (guid_equal(sec_type, _SEC_PLATFORM_MEM)) { struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); + struct ghes_mem_err mem; + + mem.notify_type = ghes->generic->notify.type; + mem.severity = gdata->error_severity; + mem.mem_err = mem_err; + + atomic_notifier_call_chain(_mem_err_chain, 0, ); ghes_edac_report_mem_error(ghes, sev, mem_err); diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h index 9f26e01..e26b783 100644 --- a/include/acpi/ghes.h +++ b/include/acpi/ghes.h @@ -115,4 +115,12 @@ static inline void *acpi_hest_get_next(struct acpi_hest_generic_data *gdata) int ghes_notify_sea(void); +struct ghes_mem_err { + int notify_type; + int severity; + struct cper_sec_mem_err *mem_err; +}; + +extern struct atomic_notifier_head ghes_mem_err_chain; + #endif /* GHES_H */ -- 1.8.3.1
[PATCH v2 3/3] arm64/ras: save error address from memory section for recovery
In some platform, when SEA triggerred, physical address might be reported by memory section, so we save it for error recovery later. Signed-off-by: Xie XiuQi <xiexi...@huawei.com> Tested-by: Wang Xiongfeng <wangxiongfe...@huawei.com> --- arch/arm64/kernel/ras.c | 31 +++ 1 file changed, 31 insertions(+) diff --git a/arch/arm64/kernel/ras.c b/arch/arm64/kernel/ras.c index 5710b2e..8a44c01 100644 --- a/arch/arm64/kernel/ras.c +++ b/arch/arm64/kernel/ras.c @@ -141,3 +141,34 @@ void arm_process_error(struct ghes *ghes, struct cper_sec_proc_arm *err) set_thread_flag(TIF_SEA_NOTIFY); } +int ghes_mem_err_callback(struct notifier_block *nb, unsigned long val, void *data) +{ + bool info_saved = false; + struct ghes_mem_err *ghes_mem = (struct ghes_mem_err *)data; + struct cper_sec_mem_err *mem_err = ghes_mem->mem_err; + + if ((ghes_mem->notify_type != ACPI_HEST_NOTIFY_SEA) || + (ghes_mem->severity != CPER_SEV_RECOVERABLE)) + return 0; + + if (mem_err->validation_bits & CPER_MEM_VALID_PA) { + info_saved = sea_save_info(mem_err->physical_addr); + } + + if (info_saved) + set_thread_flag(TIF_SEA_NOTIFY); + + return 0; +} + +static struct notifier_block ghes_mem_err_nb = { + .notifier_call = ghes_mem_err_callback, +}; + +static int arm64_err_recov_init(void) +{ + atomic_notifier_chain_register(_mem_err_chain, _mem_err_nb); + return 0; +} + +late_initcall(arm64_err_recov_init); -- 1.8.3.1
[PATCH v2 2/3] GHES: add a notify chain for process memory section
Add a notify chain for process memory section, with which other modules might do error recovery. Signed-off-by: Xie XiuQi Tested-by: Wang Xiongfeng --- drivers/acpi/apei/ghes.c | 10 ++ include/acpi/ghes.h | 8 2 files changed, 18 insertions(+) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 502335c..5b310b7 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -108,6 +108,9 @@ static inline bool is_hest_type_generic_v2(struct ghes *ghes) static LIST_HEAD(ghes_hed); static DEFINE_MUTEX(ghes_list_mutex); +ATOMIC_NOTIFIER_HEAD(ghes_mem_err_chain); +EXPORT_SYMBOL(ghes_mem_err_chain); + /* * Because the memory area used to transfer hardware error information * from BIOS to Linux can be determined only in NMI, IRQ or timer @@ -479,6 +482,13 @@ static void ghes_do_proc(struct ghes *ghes, if (guid_equal(sec_type, _SEC_PLATFORM_MEM)) { struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); + struct ghes_mem_err mem; + + mem.notify_type = ghes->generic->notify.type; + mem.severity = gdata->error_severity; + mem.mem_err = mem_err; + + atomic_notifier_call_chain(_mem_err_chain, 0, ); ghes_edac_report_mem_error(ghes, sev, mem_err); diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h index 9f26e01..e26b783 100644 --- a/include/acpi/ghes.h +++ b/include/acpi/ghes.h @@ -115,4 +115,12 @@ static inline void *acpi_hest_get_next(struct acpi_hest_generic_data *gdata) int ghes_notify_sea(void); +struct ghes_mem_err { + int notify_type; + int severity; + struct cper_sec_mem_err *mem_err; +}; + +extern struct atomic_notifier_head ghes_mem_err_chain; + #endif /* GHES_H */ -- 1.8.3.1
[PATCH v2 3/3] arm64/ras: save error address from memory section for recovery
In some platform, when SEA triggerred, physical address might be reported by memory section, so we save it for error recovery later. Signed-off-by: Xie XiuQi Tested-by: Wang Xiongfeng --- arch/arm64/kernel/ras.c | 31 +++ 1 file changed, 31 insertions(+) diff --git a/arch/arm64/kernel/ras.c b/arch/arm64/kernel/ras.c index 5710b2e..8a44c01 100644 --- a/arch/arm64/kernel/ras.c +++ b/arch/arm64/kernel/ras.c @@ -141,3 +141,34 @@ void arm_process_error(struct ghes *ghes, struct cper_sec_proc_arm *err) set_thread_flag(TIF_SEA_NOTIFY); } +int ghes_mem_err_callback(struct notifier_block *nb, unsigned long val, void *data) +{ + bool info_saved = false; + struct ghes_mem_err *ghes_mem = (struct ghes_mem_err *)data; + struct cper_sec_mem_err *mem_err = ghes_mem->mem_err; + + if ((ghes_mem->notify_type != ACPI_HEST_NOTIFY_SEA) || + (ghes_mem->severity != CPER_SEV_RECOVERABLE)) + return 0; + + if (mem_err->validation_bits & CPER_MEM_VALID_PA) { + info_saved = sea_save_info(mem_err->physical_addr); + } + + if (info_saved) + set_thread_flag(TIF_SEA_NOTIFY); + + return 0; +} + +static struct notifier_block ghes_mem_err_nb = { + .notifier_call = ghes_mem_err_callback, +}; + +static int arm64_err_recov_init(void) +{ + atomic_notifier_chain_register(_mem_err_chain, _mem_err_nb); + return 0; +} + +late_initcall(arm64_err_recov_init); -- 1.8.3.1