from:"Xie XiuQi"

[PATCH] sched,psi: fix typo in comment

2021-03-27 Thread Xie XiuQi

s/exceution/execution/
s/possibe/possible/
s/manupulations/manipulations/

Signed-off-by: Xie XiuQi 
---
 kernel/sched/psi.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 967732c0766c..7c800be47c6f 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -59,7 +59,7 @@
  * states, we would have to conclude a CPU SOME pressure number of
  * 100%, since *somebody* is waiting on a runqueue at all
  * times. However, that is clearly not the amount of contention the
- * workload is experiencing: only one out of 256 possible exceution
+ * workload is experiencing: only one out of 256 possible execution
  * threads will be contended at any given time, or about 0.4%.
  *
  * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any
@@ -73,7 +73,7 @@
  * we have to base our calculation on the number of non-idle tasks in
  * conjunction with the number of available CPUs, which is the number
  * of potential execution threads. SOME becomes then the proportion of
- * delayed tasks to possibe threads, and FULL is the share of possible
+ * delayed tasks to possible threads, and FULL is the share of possible
  * threads that are unproductive due to delays:
  *
  * threads = min(nr_nonidle_tasks, nr_cpus)
@@ -441,7 +441,8 @@ static void psi_avgs_work(struct work_struct *work)
mutex_unlock(>avgs_lock);
 }
 
-/* Trigger tracking window manupulations */
+/* Trigger tracking window manipulations */
+
 static void window_reset(struct psi_window *win, u64 now, u64 value,
 u64 prev_growth)
 {
-- 
2.25.1

[PATCH 0/2] soc/tegra: Fix build with Tegra234 configuration

2021-03-27 Thread Xie XiuQi

If only Tegra234 support is enabled, we got this build error:

 make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- -j128
  CALLscripts/atomic/check-atomics.sh
  CALLscripts/checksyscalls.sh
  CHK include/generated/compile.h
  CC  drivers/soc/tegra/fuse/fuse-tegra30.o
drivers/soc/tegra/fuse/fuse-tegra30.c:376:10: error: ‘tegra30_fuse_read’ 
undeclared here (not in a function); did you mean ‘tegra_fuse_readl’?
  .read = tegra30_fuse_read,
  ^
  tegra_fuse_readl
drivers/soc/tegra/fuse/fuse-tegra30.c:382:10: error: ‘tegra30_fuse_init’ 
undeclared here (not in a function); did you mean ‘tegra30_fuse_read’?
  .init = tegra30_fuse_init,
  ^
  tegra30_fuse_read
  CC  drivers/firmware/tegra/bpmp.o
  CHK kernel/kheaders_data.tar.xz
drivers/firmware/tegra/bpmp.c:861:51: error: ‘tegra186_soc’ undeclared here 
(not in a function); did you mean ‘tegra_ivc’?
  { .compatible = "nvidia,tegra186-bpmp", .data = _soc },
   ^~~~
   tegra_ivc

Xie XiuQi (2):
  soc/tegra: fuse: Fix build with Tegra234 configuration
  firmware: tegra: Fix build with Tegra234 configuration

 drivers/firmware/tegra/Makefile   | 1 +
 drivers/firmware/tegra/bpmp-private.h | 3 ++-
 drivers/firmware/tegra/bpmp.c | 3 ++-
 drivers/soc/tegra/fuse/fuse-tegra30.c | 3 ++-
 4 files changed, 7 insertions(+), 3 deletions(-)

-- 
2.25.1

[PATCH 2/2] firmware: tegra: Fix build with Tegra234 configuration

2021-03-27 Thread Xie XiuQi

If only Tegra234 support is enabled, the tegra186_bpmp_ops and
tegra186_soc are not defined and cause a build failure.

Add Tegra234 to the preprocessor guard to make sure these functions are
available for Tegra234-only builds as well.

Fixes: 0ebdf11699d0 ("firmware: tegra: Enable BPMP support on Tegra234")
Signed-off-by: Xie XiuQi 
---
 drivers/firmware/tegra/Makefile   | 1 +
 drivers/firmware/tegra/bpmp-private.h | 3 ++-
 drivers/firmware/tegra/bpmp.c | 3 ++-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/firmware/tegra/Makefile b/drivers/firmware/tegra/Makefile
index 49c87e00fafb..620cf3fdd607 100644
--- a/drivers/firmware/tegra/Makefile
+++ b/drivers/firmware/tegra/Makefile
@@ -3,6 +3,7 @@ tegra-bpmp-y= bpmp.o
 tegra-bpmp-$(CONFIG_ARCH_TEGRA_210_SOC)+= bpmp-tegra210.o
 tegra-bpmp-$(CONFIG_ARCH_TEGRA_186_SOC)+= bpmp-tegra186.o
 tegra-bpmp-$(CONFIG_ARCH_TEGRA_194_SOC)+= bpmp-tegra186.o
+tegra-bpmp-$(CONFIG_ARCH_TEGRA_234_SOC)+= bpmp-tegra186.o
 tegra-bpmp-$(CONFIG_DEBUG_FS)  += bpmp-debugfs.o
 obj-$(CONFIG_TEGRA_BPMP)   += tegra-bpmp.o
 obj-$(CONFIG_TEGRA_IVC)+= ivc.o
diff --git a/drivers/firmware/tegra/bpmp-private.h 
b/drivers/firmware/tegra/bpmp-private.h
index 54d560c48398..182bfe396516 100644
--- a/drivers/firmware/tegra/bpmp-private.h
+++ b/drivers/firmware/tegra/bpmp-private.h
@@ -24,7 +24,8 @@ struct tegra_bpmp_ops {
 };
 
 #if IS_ENABLED(CONFIG_ARCH_TEGRA_186_SOC) || \
-IS_ENABLED(CONFIG_ARCH_TEGRA_194_SOC)
+IS_ENABLED(CONFIG_ARCH_TEGRA_194_SOC) || \
+IS_ENABLED(CONFIG_ARCH_TEGRA_234_SOC)
 extern const struct tegra_bpmp_ops tegra186_bpmp_ops;
 #endif
 #if IS_ENABLED(CONFIG_ARCH_TEGRA_210_SOC)
diff --git a/drivers/firmware/tegra/bpmp.c b/drivers/firmware/tegra/bpmp.c
index 0742a90cb844..5654c5e9862b 100644
--- a/drivers/firmware/tegra/bpmp.c
+++ b/drivers/firmware/tegra/bpmp.c
@@ -809,7 +809,8 @@ static const struct dev_pm_ops tegra_bpmp_pm_ops = {
 };
 
 #if IS_ENABLED(CONFIG_ARCH_TEGRA_186_SOC) || \
-IS_ENABLED(CONFIG_ARCH_TEGRA_194_SOC)
+IS_ENABLED(CONFIG_ARCH_TEGRA_194_SOC) || \
+IS_ENABLED(CONFIG_ARCH_TEGRA_234_SOC)
 static const struct tegra_bpmp_soc tegra186_soc = {
.channels = {
.cpu_tx = {
-- 
2.25.1

[PATCH 1/2] soc/tegra: fuse: Fix build with Tegra234 configuration

2021-03-27 Thread Xie XiuQi

If only Tegra234 support is enabled, the tegra30_fuse_read() and
tegra30_fuse_init() function are not declared and cause a build failure.
Add Tegra234 to the preprocessor guard to make sure these functions are
available for Tegra234-only builds as well.

Fixes: 1f44febf71ba ("soc/tegra: fuse: Add Tegra234 support")
Signed-off-by: Xie XiuQi 
---
 drivers/soc/tegra/fuse/fuse-tegra30.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/soc/tegra/fuse/fuse-tegra30.c 
b/drivers/soc/tegra/fuse/fuse-tegra30.c
index 9ea7f0168457..c1aa7815bd6e 100644
--- a/drivers/soc/tegra/fuse/fuse-tegra30.c
+++ b/drivers/soc/tegra/fuse/fuse-tegra30.c
@@ -37,7 +37,8 @@
 defined(CONFIG_ARCH_TEGRA_132_SOC) || \
 defined(CONFIG_ARCH_TEGRA_210_SOC) || \
 defined(CONFIG_ARCH_TEGRA_186_SOC) || \
-defined(CONFIG_ARCH_TEGRA_194_SOC)
+defined(CONFIG_ARCH_TEGRA_194_SOC) || \
+defined(CONFIG_ARCH_TEGRA_234_SOC)
 static u32 tegra30_fuse_read_early(struct tegra_fuse *fuse, unsigned int 
offset)
 {
if (WARN_ON(!fuse->base))
-- 
2.25.1

[PATCH] scsi: qedi: remove unused variable udev & uctrl

2020-05-05 Thread Xie XiuQi

uctrl and udev are unused after commit 9632a6b4b747
("scsi: qedi: Move LL2 producer index processing in BH.")

Remove them.

Signed-off-by: Xie XiuQi 
---
 drivers/scsi/qedi/qedi_main.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/drivers/scsi/qedi/qedi_main.c b/drivers/scsi/qedi/qedi_main.c
index b995b19865ca..313f7e10aed9 100644
--- a/drivers/scsi/qedi/qedi_main.c
+++ b/drivers/scsi/qedi/qedi_main.c
@@ -658,8 +658,6 @@ static struct qedi_ctx *qedi_host_alloc(struct pci_dev 
*pdev)
 static int qedi_ll2_rx(void *cookie, struct sk_buff *skb, u32 arg1, u32 arg2)
 {
struct qedi_ctx *qedi = (struct qedi_ctx *)cookie;
-   struct qedi_uio_dev *udev;
-   struct qedi_uio_ctrl *uctrl;
struct skb_work_list *work;
struct ethhdr *eh;
 
@@ -698,9 +696,6 @@ static int qedi_ll2_rx(void *cookie, struct sk_buff *skb, 
u32 arg1, u32 arg2)
  "Allowed frame ethertype [0x%x] len [0x%x].\n",
  eh->h_proto, skb->len);
 
-   udev = qedi->udev;
-   uctrl = udev->uctrl;
-
work = kzalloc(sizeof(*work), GFP_ATOMIC);
if (!work) {
QEDI_WARN(>dbg_ctx,
-- 
2.20.1

[PATCH] Input: atmel_mxt_ts: remove meaningless checking

2020-05-05 Thread Xie XiuQi

byte_offset is unsigned integer, it's always >= 0, so remove this
meaningless checking.

Signed-off-by: Xie XiuQi 
---
 drivers/input/touchscreen/atmel_mxt_ts.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/input/touchscreen/atmel_mxt_ts.c 
b/drivers/input/touchscreen/atmel_mxt_ts.c
index ae60442efda0..1bd072c5a49d 100644
--- a/drivers/input/touchscreen/atmel_mxt_ts.c
+++ b/drivers/input/touchscreen/atmel_mxt_ts.c
@@ -1368,7 +1368,7 @@ static int mxt_prepare_cfg_mem(struct mxt_data *data, 
struct mxt_cfg *cfg)
 
byte_offset = reg + i - cfg->start_ofs;
 
-   if (byte_offset >= 0 && byte_offset < cfg->mem_size) {
+   if (byte_offset < cfg->mem_size) {
*(cfg->mem + byte_offset) = val;
} else {
dev_err(dev, "Bad object: reg:%d, T%d, 
ofs=%d\n",
-- 
2.20.1

[PATCH net-next] ixgbe: fix signed-integer-overflow warning

2020-05-04 Thread Xie XiuQi

ubsan report this warning, fix it by adding a unsigned suffix.

UBSAN: signed-integer-overflow in
drivers/net/ethernet/intel/ixgbe/ixgbe_common.c:2246:26
65535 * 65537 cannot be represented in type 'int'
CPU: 21 PID: 7 Comm: kworker/u256:0 Not tainted 5.7.0-rc3-debug+ #39
Hardware name: Huawei TaiShan 2280 V2/BC82AMDC, BIOS 2280-V2 03/27/2020
Workqueue: ixgbe ixgbe_service_task [ixgbe]
Call trace:
 dump_backtrace+0x0/0x3f0
 show_stack+0x28/0x38
 dump_stack+0x154/0x1e4
 ubsan_epilogue+0x18/0x60
 handle_overflow+0xf8/0x148
 __ubsan_handle_mul_overflow+0x34/0x48
 ixgbe_fc_enable_generic+0x4d0/0x590 [ixgbe]
 ixgbe_service_task+0xc20/0x1f78 [ixgbe]
 process_one_work+0x8f0/0xf18
 worker_thread+0x430/0x6d0
 kthread+0x218/0x238
 ret_from_fork+0x10/0x18

Reported-by: Hulk Robot 
Signed-off-by: Xie XiuQi 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
index 0bd1294ba517..39c5e6fdb72c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
@@ -2243,7 +2243,7 @@ s32 ixgbe_fc_enable_generic(struct ixgbe_hw *hw)
}
 
/* Configure pause time (2 TCs per register) */
-   reg = hw->fc.pause_time * 0x00010001;
+   reg = hw->fc.pause_time * 0x00010001U;
for (i = 0; i < (MAX_TRAFFIC_CLASS / 2); i++)
IXGBE_WRITE_REG(hw, IXGBE_FCTTV(i), reg);
 
-- 
2.20.1

[tip: sched/core] sched/debug: Fix trival print_task() format

2020-05-01 Thread tip-bot2 for Xie XiuQi

The following commit has been merged into the sched/core branch of tip:

Commit-ID: f080d93e1d419099a99d7473ed532289ca8dc717
Gitweb:
https://git.kernel.org/tip/f080d93e1d419099a99d7473ed532289ca8dc717
Author:Xie XiuQi 
AuthorDate:Tue, 14 Apr 2020 20:57:21 +08:00
Committer: Peter Zijlstra 
CommitterDate: Thu, 30 Apr 2020 20:14:37 +02:00

sched/debug: Fix trival print_task() format

Ensure leave one space between state and task name.

w/o patch:
runnable tasks:
 S   task   PID tree-key  switches  prio wait
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20200414125721.195801-1-xiexi...@huawei.com
---
 kernel/sched/debug.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index a562df5..b3ac1c1 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -437,7 +437,7 @@ print_task(struct seq_file *m, struct rq *rq, struct 
task_struct *p)
else
SEQ_printf(m, " %c", task_state_to_char(p));
 
-   SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
+   SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ",
p->comm, task_pid_nr(p),
SPLIT_NS(p->se.vruntime),
(long long)(p->nvcsw + p->nivcsw),
@@ -464,10 +464,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, 
int rq_cpu)
 
SEQ_printf(m, "\n");
SEQ_printf(m, "runnable tasks:\n");
-   SEQ_printf(m, " S   task   PID tree-key  switches  prio"
+   SEQ_printf(m, " Stask   PID tree-key  switches  
prio"
   " wait-time sum-execsum-sleep\n");
SEQ_printf(m, "---"
-  "\n");
+  "--\n");
 
rcu_read_lock();
for_each_process_thread(g, p) {

Re: [PATCH] arm64: panic on synchronous external abort in kernel context

2020-04-30 Thread Xie XiuQi

Hi James,

Sorry for reply late.

On 2020/4/14 22:53, James Morse wrote:
> Hi Xie,
> 
> On 14/04/2020 13:39, Xie XiuQi wrote:
>> On 2020/4/14 20:16, James Morse wrote:
>>> On 14/04/2020 11:59, Mark Rutland wrote:
>>>> On Fri, Apr 10, 2020 at 09:52:45AM +0800, Xie XiuQi wrote:
>>>>> We should panic even panic_on_oops is not set, when we can't recover
>>>>> from synchronous external abort in kernel context.
>>>
>>> Hmm, fault-from-kernel-context doesn't mean the fault affects the kernel. 
>>> If the kernel is
>>> reading or writing from user-space memory for a syscall, its the user-space 
>>> memory that is
>>> affected. This thread can't make progress, so we kill it.
>>> If its a kernel thread or we were in irq context, we panic().
>>>
>>> I don't think you really want all faults that happen as a result of a 
>>> kernel access to be
>>> fatal!
> 
>> Yes, you're right. I just want to fix a hung up when ras error inject 
>> testing.
>>
>> panic_on_oops is not set in the kernel image for testing. When receiving a 
>> sea in kernel
>> context, the PE trap in do_exit(), and can't return any more.
> 
> trap? gets trapped, (or gets stuck, to prevent confusion with the 
> architectures use of the
> word 'trap'!)
> 
> 
>> I analyze the source code, the call trace might like this:
>>do_mem_abort
>>  -> arm64_notify_die
>> -> die# kernel context, call die() directly;
>>-> do_exit # kernel process context, call 
>> do_exit(SIGSEGV);
>>   -> do_task_dead()   # call do_task_dead(), and hung up this 
>> core;
> 
> Thanks for the trace. This describes a corrected error in your I-cache, that 
> occurred
> while the kernel was executing a kernel thread. These shouldn't be fatal, 
> because it was
> corrected ... but the kernel doesn't know that because it doesn't know how to 
> parse those
> records.
> 
> There are two things wrong here:
> 1. it locks up while trying to kill the thread.
> 2. it tried to kill the thread in the first place!
> 
> For 1, does your l1l2_inject module take any spinlocks or tinker with the 
> pre-empt counter?
> 
> I suspect this is some rarely-tested path in do_task_dead() that sleeps, but 
> can't from
> your l1l2_inject module because the pre-empt counter has been raised.
> 
> CONFIG_DEBUG_ATOMIC_SLEEP may point at the function to blame.
> 
> It may be accessing some thread data that kthreads don't have, taking a 
> second exception
> and blocking on the die_lock. LOCKDEP should catch this one.
> 
> We should fix this one first.
> 

I analyze the l1l2_inject module, there is a kworker thread used to inject 
error.
do_sea() try to kill the kworker thread, so the work(s) queued on this kworker
is blocked.

panic_on_oops option is usually set on production environment, so if someone
unset this option for testing, he should take his own risk. Is it right?

> 
> 2 is a bit more complicated. Today, this is fatal because the arch code 
> assumes this was
> probably a memory error, and if it returns to user-space it can't avoid 
> getting stuck in a
> loop until the scheduled memory_failure() work runs. Instead it 
> unconditionally signals
> the process.
> 
> [0] fixes this up for memory errors. But in this case it will assume all the 
> work has been
> done by APEI, (or will be before we get back to user-space). APEI ignored the 
> processor
> error you fed it, because it doesn't know what they are, they are just 
> printed out.
> 
> This is fine for corrected errors, but were are reliant on your firmware 
> describing
> uncorrected errors with a 'fatal' severity... which might be too heavy a 
> hammer. (Ideally
> that would mean 'uncontained', and the kernel should handle, or detect it 
> can't, any other
> errror...)
> 
> We can discuss the right thing to do here when support for parsing these 'arm 
> processor
> errors' is posted.
> (If you think I need to do something different in [0] because of this, please 
> shout!)

For some errors which could be recoverable or corrected, do_sea() should not 
kill process
or die() unconditionally. It's better detect this situation.

> 
> 
>> [  387.740609] {1}[Hardware Error]: Hardware error from APEI Generic 
>> Hardware Error Source: 9
>> [  387.748837] {1}[Hardware Error]: event severity: recoverable
>> [  387.754470] {1}[Hardware Error]:  Error 0, type: recoverable
> 
>> [  387.760103] {1}[Hardware Error]:   section_type: ARM processor error
> 
> et voila! Case 2. Linux doesn't handl

Re: [PATCH] epoll: optimize epmutex in ep_free and eventpoll_release_file

2019-08-08 Thread Xie XiuQi

  * a lockdep warning.
>*/
> - mutex_lock(>mtx);
>   while ((rbp = rb_first_cached(>rbr)) != NULL) {
>   epi = rb_entry(rbp, struct epitem, rbn);
>   ep_remove(ep, epi);
> @@ -854,11 +885,19 @@ static void ep_free(struct eventpoll *ep)
>   }
>   mutex_unlock(>mtx);
>  
> - mutex_unlock();
> - mutex_destroy(>mtx);
> - free_uid(ep->user);
> - wakeup_source_unregister(ep->ws);
> - kfree(ep);
> + /*
> +  * ep will not been added to visited_list, because ep_ctrl()
> +  * can not get its reference and can not reference it by the
> +  * corresponding epitem. The only possible operation is list_del_init,
> +  * so it's OK to use list_empty_careful() here.
> +  */
> + if (!list_empty_careful(>visited_list_link)) {
> + mutex_lock();
> + list_del_init(>visited_list_link);
> + mutex_unlock();
> + }
> +
> + eventpoll_put_ep(ep);
>  }
>  
>  static int ep_eventpoll_release(struct inode *inode, struct file *file)
> @@ -985,7 +1024,7 @@ static const struct file_operations eventpoll_fops = {
>  void eventpoll_release_file(struct file *file)
>  {
>   struct eventpoll *ep;
> - struct epitem *epi, *next;
> + struct epitem *epi;
>  
>   /*
>* We don't want to get "file->f_lock" because it is not
> @@ -1000,14 +1039,51 @@ void eventpoll_release_file(struct file *file)
>*
>* Besides, ep_remove() acquires the lock, so we can't hold it here.
>*/
> - mutex_lock();
> - list_for_each_entry_safe(epi, next, >f_ep_links, fllink) {
> - ep = epi->ep;
> + rcu_read_lock();
> + while (true) {
> + epi = list_first_or_null_rcu(>f_ep_links,
> + struct epitem, fllink);
> + if (!epi)
> + break;
> +
> + ep = eventpoll_get_ep(epi->ep);
> + /* Current epi had been removed by ep_free() */
> + if (!ep)
> + continue;
> + rcu_read_unlock();
> +
>   mutex_lock_nested(>mtx, 0);
> - ep_remove(ep, epi);
> + /*
> +  * If rb_first_cached() returns NULL, it means that
> +  * the current epi had been removed by ep_free().
> +  * To prevent epi from double-freeing, check the
> +  * condition before invoking ep_remove().
> +  * If eventpoll_release_file() frees epi firstly,
> +  * the epi will not be freed again because the epi
> +  * must have been removed from ep->rbr when ep_free()
> +  * is invoked.
> +  */
> + if (rb_first_cached(>rbr))
> + ep_remove(ep, epi);
>   mutex_unlock(>mtx);
> +
> + eventpoll_put_ep(ep);
> +
> + rcu_read_lock();
> + }
> + rcu_read_unlock();
> +
> + /*
> +  * The file can not been added to tfile_check_list again, because
> +  * (1) refcnt has been zero, ep_ctrl() can no longer get its reference
> +  * (2) related ep items have been removed, ep_loop_check_proc() can not
> +  * get the file by ep->rbr.
> +  */
> + if (!list_empty_careful(>f_tfile_llink)) {
> + mutex_lock();
> + list_del_init(>f_tfile_llink);
> + mutex_unlock();
>   }
> - mutex_unlock();
>  }
>  
>  static int ep_alloc(struct eventpoll **pep)
> @@ -1030,6 +1106,8 @@ static int ep_alloc(struct eventpoll **pep)
>   ep->rbr = RB_ROOT_CACHED;
>   ep->ovflist = EP_UNACTIVE_PTR;
>   ep->user = user;
> + INIT_LIST_HEAD(>visited_list_link);
> + refcount_set(>ref, 1);
>  
>   *pep = ep;
>  
> @@ -2018,7 +2096,7 @@ static int ep_loop_check(struct eventpoll *ep, struct 
> file *file)
>   list_for_each_entry_safe(ep_cur, ep_next, _list,
>   visited_list_link) {
>   ep_cur->visited = 0;
> - list_del(_cur->visited_list_link);
> + list_del_init(_cur->visited_list_link);
>   }
>   return ret;
>  }
> 

-- 
Thanks,
Xie XiuQi

[tip:sched/urgent] sched/numa: Fix a possible divide-by-zero

2019-04-25 Thread tip-bot for Xie XiuQi

Commit-ID:  a860fa7b96e1a1c974556327aa1aee852d434c21
Gitweb: https://git.kernel.org/tip/a860fa7b96e1a1c974556327aa1aee852d434c21
Author: Xie XiuQi 
AuthorDate: Sat, 20 Apr 2019 16:34:16 +0800
Committer:  Ingo Molnar 
CommitDate: Thu, 25 Apr 2019 19:58:54 +0200

sched/numa: Fix a possible divide-by-zero

sched_clock_cpu() may not be consistent between CPUs. If a task
migrates to another CPU, then se.exec_start is set to that CPU's
rq_clock_task() by update_stats_curr_start(). Specifically, the new
value might be before the old value due to clock skew.

So then if in numa_get_avg_runtime() the expression:

  'now - p->last_task_numa_placement'

ends up as -1, then the divider '*period + 1' in task_numa_placement()
is 0 and things go bang. Similar to update_curr(), check if time goes
backwards to avoid this.

[ peterz: Wrote new changelog. ]
[ mingo: Tweaked the code comment. ]

Signed-off-by: Xie XiuQi 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: cj.chengj...@huawei.com
Cc: 
Link: 
http://lkml.kernel.org/r/20190425080016.gx11...@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a4d9e14bf138..35f3ea375084 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2007,6 +2007,10 @@ static u64 numa_get_avg_runtime(struct task_struct *p, 
u64 *period)
if (p->last_task_numa_placement) {
delta = runtime - p->last_sum_exec_runtime;
*period = now - p->last_task_numa_placement;
+
+   /* Avoid time going backwards, prevent potential divide error: 
*/
+   if (unlikely((s64)*period < 0))
+   *period = 0;
} else {
delta = p->se.avg.load_sum;
*period = LOAD_AVG_MAX;

[tip:sched/urgent] sched/numa: Fix a possible divide-by-zero

2019-04-25 Thread tip-bot for Xie XiuQi

Commit-ID:  993efef2483bde15fd33ec5fba5464597c2d8124
Gitweb: https://git.kernel.org/tip/993efef2483bde15fd33ec5fba5464597c2d8124
Author: Xie XiuQi 
AuthorDate: Sat, 20 Apr 2019 16:34:16 +0800
Committer:  Ingo Molnar 
CommitDate: Thu, 25 Apr 2019 19:58:35 +0200

sched/numa: Fix a possible divide-by-zero

sched_clock_cpu() may not be consistent between CPUs. If a task
migrates to another CPU, then se.exec_start is set to that CPU's
rq_clock_task() by update_stats_curr_start(). Specifically, the new
value might be before the old value due to clock skew.

So then if in numa_get_avg_runtime() the expression:

  'now - p->last_task_numa_placement'

ends up as -1, then the divider '*period + 1' in task_numa_placement()
is 0 and things go bang. Similar to update_curr(), check if time goes
backwards to avoid this.

[ peterz: Wrote new changelog. ]
[ mingo: Tweaked the code comment. ]

Signed-off-by: Xie XiuQi 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: cj.chengj...@huawei.com
Link: 
http://lkml.kernel.org/r/20190425080016.gx11...@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a4d9e14bf138..35f3ea375084 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2007,6 +2007,10 @@ static u64 numa_get_avg_runtime(struct task_struct *p, 
u64 *period)
if (p->last_task_numa_placement) {
delta = runtime - p->last_sum_exec_runtime;
*period = now - p->last_task_numa_placement;
+
+   /* Avoid time going backwards, prevent potential divide error: 
*/
+   if (unlikely((s64)*period < 0))
+   *period = 0;
} else {
delta = p->se.avg.load_sum;
*period = LOAD_AVG_MAX;

Re: [PATCH] sched: fix a potential divide error

2019-04-25 Thread Xie XiuQi

Hi Peter,

On 2019/4/25 16:00, Peter Zijlstra wrote:
> On Thu, Apr 25, 2019 at 11:52:28AM +0800, Xie XiuQi wrote:
>> On 2019/4/24 2:44, Peter Zijlstra wrote:
> 
>>> I'll try and come up with a better Changelog tomorrow.
> 
> I actually did, but forgot to send out. I have the below.
> Does that work for you?

It works for me, thank you very much.

> 
> ---
> Subject: sched/numa: Fix a possible divide-by-zero
> From: Xie XiuQi 
> Date: Sat, 20 Apr 2019 16:34:16 +0800
> 
> sched_clock_cpu() may not be consistent between CPUs. If a task
> migrates to another CPU, then se.exec_start is set to that CPU's
> rq_clock_task() by update_stats_curr_start(). Specifically, the new
> value might be before the old value due to clock skew.
> 
> So then if in numa_get_avg_runtime() the expression:
> 
>   'now - p->last_task_numa_placement'
> 
> ends up as -1, then the divider '*period + 1' in task_numa_placement()
> is 0 and things go bang. Similar to update_curr(), check if time goes
> backwards to avoid this.
> 
> Cc: mi...@redhat.com
> Cc: cj.chengj...@huawei.com
> Signed-off-by: Xie XiuQi 
> [peterz: simplified changelog]
> Signed-off-by: Peter Zijlstra (Intel) 
> Link: https://lkml.kernel.org/r/20190420083416.170446-1-xiexi...@huawei.com
> ---
>  kernel/sched/fair.c |4 
>  1 file changed, 4 insertions(+)
> 
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2007,6 +2007,10 @@ static u64 numa_get_avg_runtime(struct t
>   if (p->last_task_numa_placement) {
>   delta = runtime - p->last_sum_exec_runtime;
>   *period = now - p->last_task_numa_placement;
> +
> + /* Avoid backward, and prevent potential divide error */
> + if (unlikely((s64)*period < 0))
> + *period = 0;
>   } else {
>   delta = p->se.avg.load_sum;
>   *period = LOAD_AVG_MAX;
> 
> .
> 

-- 
Thanks,
Xie XiuQi

Re: [PATCH] sched: fix a potential divide error

2019-04-24 Thread Xie XiuQi

Hi Peter,

Thanks for your comments.

On 2019/4/24 2:44, Peter Zijlstra wrote:
> On Sat, Apr 20, 2019 at 04:34:16PM +0800, Xie XiuQi wrote:
>> We meet a divide error on 3.10.0 kernel, the error message is bellow:
> 
> That is a _realll_ old kernel. I would urge you to upgrade.

I will.

> 
>> [42.287996] divide error:  [#1] SMP
> 
>> sched_clock_cpu may not be consistent bwtwen cpus. If a task migrate
>> to another cpu, the se.exec_start was set to that cpu's rq_clock_task
>> by update_stats_curr_start(). Which may not be monotonic.
>>
>> update_stats_curr_start
>>   <- set_next_entity
>>  <- set_curr_task_fair
>> <- sched_move_task
> 
> That is not in fact a cross-cpu migration path. But I see the point.
> Also many migration paths do in fact preserve monotonicity, even when
> the clock is busted, but you're right, not all of them.
> 
>> So, if  now - p->last_task_numa_placement is -1, then (*period + 1) is
>> 0, and divide error was triggerred at the div operation:
>>   task_numa_placement:
>> runtime = numa_get_avg_runtime(p, );
>> f_weight = div64_u64(runtime << 16, period + 1);  // divide error here
>>
>> In this patch, we make sure period is not less than 0 to avoid this
>> divide error.
>>
>> Signed-off-by: Xie XiuQi 
>> Cc: sta...@vger.kernel.org
>> ---
>>  kernel/sched/fair.c | 4 
>>  1 file changed, 4 insertions(+)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 40bd1e27b1b7..f2abb258fc85 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -2007,6 +2007,10 @@ static u64 numa_get_avg_runtime(struct task_struct 
>> *p, u64 *period)
>>  if (p->last_task_numa_placement) {
>>  delta = runtime - p->last_sum_exec_runtime;
>>  *period = now - p->last_task_numa_placement;
>> +
>> +/* Avoid backward, and prevent potential divide error */
>> +if ((s64)*period < 0)
>> +*period = 0;
>>  } else {
>>  delta = p->se.avg.load_sum;
>>  *period = LOAD_AVG_MAX;
> 
> Yeah, I suppose that is indeed correct.
> 
> I'll try and come up with a better Changelog tomorrow.

Thanks.

-- 
Thanks,
Xie XiuQi

[PATCH] sched: fix a potential divide error

2019-04-20 Thread Xie XiuQi

We meet a divide error on 3.10.0 kernel, the error message is bellow:

[42.287996] divide error:  [#1] SMP
[42.297796] do nothing after die!
[42.299108] Modules linked in: signo_catch macvlan binfmt_misc
ip_set_hash_netport ip_set_hash_ipport vport_vxlan ipt_REJECT
xt_statistic xt_physdev xt_nat xt_recent xt_mark xt_comment ...
[42.312751] CPU: 8 PID: 23352 Comm: bash Tainted: G V---   3.10.0+ 
#1
[42.314308] Hardware name: OpenStack Foundation OpenStack Nova, BIOS
rel-1.9.1-0-gb3ef39f-20170329_185309-build9a64a246a231 04/01/2014
[42.317411] task: 880033fc9700 ti: 8807fed6 
task.ti:8807fed6
[42.318967] RIP: 0010:[]  [] 
task_numa_fault+0x1c2/0xbb0
[42.320515] RSP: :8807fed63d38  EFLAGS: 00010246
[42.322018] RAX: 002b7efd RBX: 880033fc9700 RCX:0003
[42.323563] RDX:  RSI: 0400 RDI:81a80f60
[42.325052] RBP: 8807fed63db8 R08: 81a80f68 R09:
[42.326531] R10: 88083ffda000 R11:  R12:0424
[42.327987] R13: 002b7efd R14:  R15:ea001ea42a00
[42.329420] FS:  7fa01a3b7740() 
GS:88103ec0()knlGS:
[42.330866] CS:  0010 DS:  ES:  CR0: 80050033
[42.332302] CR2: 00ff1fb0 CR3: 0007ff1d1000 CR4:003407e0
[42.333763] DR0:  DR1:  DR2:
[42.335187] DR3:  DR6: fffe0ff0 DR7:0400
[42.336595] Stack:
[42.337974]   0001bc9598a8 ea001ea42a00 
00010001
[42.339374]  00030001 0001 ea001ea42a00 
8807fed63db8
[42.340768]    bc9598a8 
0001
[42.342148] Call Trace:
[42.343494]  [] do_numa_page+0x162/0x1f0
[42.344831]  [] handle_mm_fault+0x627/0xf50
[42.346145]  [] __do_page_fault+0x166/0x470
[42.347442]  [] trace_do_page_fault+0x43/0x110
[42.348711]  [] do_async_page_fault+0x29/0xe0
[42.349948]  [] async_page_fault+0x28/0x30
[42.351149] Code: 00 3d 00 04 00 00 44 0f 4e d8 41 81 fb 00 04 00 00
0f 84 67 07 00 00 4c 89 e8 49 83 c6 01 31 d2 48 c1 e0 10 49 83 c4 01 45
31 c9 <49> f7 f6 48 c7 45 a8 00 00 00 00 48 c7 45 b0 00 00 00 00 49 89
[42.353707] RIP  [] task_numa_fault+0x1c2/0xbb0
[42.354927]  RSP 
[42.358114] ---[ end trace 4f2465cac18ff65e ]---
[42.359304] Kernel panic - not syncing: Fatal exception

sched_clock_cpu may not be consistent bwtwen cpus. If a task migrate
to another cpu, the se.exec_start was set to that cpu's rq_clock_task
by update_stats_curr_start(). Which may not be monotonic.

update_stats_curr_start
  <- set_next_entity
 <- set_curr_task_fair
<- sched_move_task

So, if  now - p->last_task_numa_placement is -1, then (*period + 1) is
0, and divide error was triggerred at the div operation:
  task_numa_placement:
runtime = numa_get_avg_runtime(p, );
f_weight = div64_u64(runtime << 16, period + 1);  // divide error here

In this patch, we make sure period is not less than 0 to avoid this
divide error.

Signed-off-by: Xie XiuQi 
Cc: sta...@vger.kernel.org
---
 kernel/sched/fair.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 40bd1e27b1b7..f2abb258fc85 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2007,6 +2007,10 @@ static u64 numa_get_avg_runtime(struct task_struct *p, 
u64 *period)
if (p->last_task_numa_placement) {
delta = runtime - p->last_sum_exec_runtime;
*period = now - p->last_task_numa_placement;
+
+   /* Avoid backward, and prevent potential divide error */
+   if ((s64)*period < 0)
+   *period = 0;
} else {
delta = p->se.avg.load_sum;
*period = LOAD_AVG_MAX;
-- 
2.20.1

Re: KASAN: use-after-free Write in _free_event

2019-02-16 Thread Xie XiuQi

ash yet.
> 
> Is there a chance of getting a reproducer for this one?
> 
> Thanks,
> --
> Alex
> 
> .
> 

-- 
Thanks,
Xie XiuQi
==
BUG: KASAN: use-after-free in atomic_dec_and_test 
include/asm-generic/atomic-instrumented.h:259 [inline]
BUG: KASAN: use-after-free in put_task_struct include/linux/sched/task.h:95 
[inline]
BUG: KASAN: use-after-free in _free_event+0x2c4/0xfe0 kernel/events/core.c:4452
Write of size 4 at addr 8883d8d80020 by task syz-executor0/28620

CPU: 1 PID: 28620 Comm: syz-executor0 Not tainted 4.19.18-514.55.6.9.x86_64+ #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 
04/01/2014
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0xca/0x13e lib/dump_stack.c:113
 print_address_description+0x79/0x330 mm/kasan/report.c:256
 kasan_report_error mm/kasan/report.c:354 [inline]
 kasan_report+0x18a/0x2e0 mm/kasan/report.c:412
 atomic_dec_and_test include/asm-generic/atomic-instrumented.h:259 [inline]
 put_task_struct include/linux/sched/task.h:95 [inline]
 _free_event+0x2c4/0xfe0 kernel/events/core.c:4452
 free_event+0x41/0xa0 kernel/events/core.c:4473
 perf_event_release_kernel+0x43b/0xc70 kernel/events/core.c:4634
 perf_release+0x33/0x40 kernel/events/core.c:4648
 __fput+0x27f/0x7f0 fs/file_table.c:278
 task_work_run+0x136/0x1b0 kernel/task_work.c:113
 tracehook_notify_resume include/linux/tracehook.h:193 [inline]
 exit_to_usermode_loop+0x1a7/0x1d0 arch/x86/entry/common.c:166
 prepare_exit_to_usermode arch/x86/entry/common.c:197 [inline]
 syscall_return_slowpath arch/x86/entry/common.c:268 [inline]
 do_syscall_64+0x461/0x580 arch/x86/entry/common.c:293
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x4115f7
Code: 00 00 0f 05 48 3d 00 f0 ff ff 77 3f f3 c3 0f 1f 44 00 00 53 89 fb 48 83 
ec 10 e8 f4 fb ff ff 89 df 89 c2 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 2b 
89 d7 89 44 24 0c e8 36 fc ff ff 8b 44 24
RSP: 002b:7ffe24d0ba80 EFLAGS: 0293 ORIG_RAX: 0003
RAX:  RBX: 0005 RCX: 004115f7
RDX:  RSI: 00730f30 RDI: 0005
RBP: 7ffe24d0baec R08: 7ffe24d0b9b0 R09: 7ffe24d0b9b0
R10: 7ffe24d0b9c0 R11: 0293 R12: 0001
R13: 0001 R14:  R15: 

Allocated by task 28624:
 set_track mm/kasan/kasan.c:460 [inline]
 kasan_kmalloc+0xa0/0xd0 mm/kasan/kasan.c:553
 slab_post_alloc_hook mm/slab.h:444 [inline]
 slab_alloc_node mm/slub.c:2706 [inline]
 kmem_cache_alloc_node+0xe3/0x2e0 mm/slub.c:2742
 alloc_task_struct_node kernel/fork.c:157 [inline]
 dup_task_struct kernel/fork.c:802 [inline]
 copy_process.part.26+0x1a98/0x6c70 kernel/fork.c:1707
 copy_process kernel/fork.c:1664 [inline]
 _do_fork+0x1c6/0xe60 kernel/fork.c:2175
 do_syscall_64+0xc8/0x580 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Freed by task 28624:
 set_track mm/kasan/kasan.c:460 [inline]
 __kasan_slab_free+0x12e/0x180 mm/kasan/kasan.c:521
 slab_free_hook mm/slub.c:1371 [inline]
 slab_free_freelist_hook mm/slub.c:1398 [inline]
 slab_free mm/slub.c:2953 [inline]
 kmem_cache_free+0xbd/0x320 mm/slub.c:2969
 copy_process.part.26+0x18d9/0x6c70 kernel/fork.c:2107
 copy_process kernel/fork.c:1664 [inline]
 _do_fork+0x1c6/0xe60 kernel/fork.c:2175
 do_syscall_64+0xc8/0x580 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

The buggy address belongs to the object at 8883d8d8
 which belongs to the cache task_struct of size 9992
The buggy address is located 32 bytes inside of
 9992-byte region [8883d8d8, 8883d8d82708)
The buggy address belongs to the page:
page:ea000f636000 count:1 mapcount:0 mapping:888107d07a00 index:0x0 
compound_mapcount: 0
flags: 0x2f80008100(slab|head)
raw: 002f80008100  00020001 888107d07a00
raw:  80030003 0001 
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 8883d8d7ff00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
 8883d8d7ff80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>8883d8d8: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
   ^
 8883d8d80080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 8883d8d80100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb

BUG: KASAN: use-after-free in atomic_dec_and_test 
include/asm-generic/atomic-instrumented.h:259 [inline]
BUG: KASAN: use-after-free in put_task_struct include/linux/sched/task.h:95 
[inline]
BUG: KASAN: use-after-free in _free_event+0x2c4/0xfe0 kernel/events/core.c:4452
Write of size 4 at addr 8883780cd0a0 by task syz-executor0/9553

CPU: 3 PID: 9553 Comm: syz-executor0 Not tai

Re: [PATCH] sched: fix infinity loop in update_blocked_averages

2018-12-27 Thread Xie XiuQi

Hi Tejun,

On 2018/12/28 10:02, Tejun Heo wrote:
> On Thu, Dec 27, 2018 at 05:53:52PM -0800, Tejun Heo wrote:
>> Vincent knows that part way better than me but I think the safest way
>> would be doing the optimization removal iff tmp_alone_branch is
>> already pointing to leaf_cfs_rq_list.  IIUC, it's pointing to
>> something else only while a branch is being built and deferring
>> optimization removal by an avg update cycle isn't gonna make any
>> difference anyway.
> 
> So, something like the following.  Xie, can you see whether the
> following patch resolves the problem?

Zhipeng is preparing to test it, thanks.

> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index d1907506318a..88b9118b5191 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7698,7 +7698,8 @@ static void update_blocked_averages(int cpu)
>* There can be a lot of idle CPU cgroups.  Don't let fully
>* decayed cfs_rqs linger on the list.
>*/
> - if (cfs_rq_is_decayed(cfs_rq))
> + if (cfs_rq_is_decayed(cfs_rq) &&
> + rq->tmp_alone_branch == >leaf_cfs_rq_list)
>   list_del_leaf_cfs_rq(cfs_rq);
>  
>   /* Don't need periodic decay once load/util_avg are null */
> 
> .
> 

-- 
Thanks,
Xie XiuQi

[PATCH] sched: fix infinity loop in update_blocked_averages

2018-12-26 Thread Xie XiuQi

Zhepeng Xie report a bug, there is a infinity loop in
update_blocked_averages().

PID: 14233  TASK: 800b2de08fc0  CPU: 1   COMMAND: "docker"
 #0 [2213b9d0] update_blocked_averages at 0811e4a8
 #1 [2213ba60] pick_next_task_fair at 0812a3b4
 #2 [2213baf0] __schedule at 08deaa88
 #3 [2213bb70] schedule at 08deb1b8
 #4 [2213bb80] futex_wait_queue_me at 08180754
 #5 [2213bbd0] futex_wait at 0818192c
 #6 [2213bd00] do_futex at 08183ee4
 #7 [2213bde0] __arm64_sys_futex at 08184398
 #8 [2213be60] el0_svc_common at 080979ac
 #9 [2213bea0] el0_svc_handler at 08097a6c
 #10 [2213bff0] el0_svc at 08084044

rq->tmp_alone_branch introduced in 4.10, used to point to
the new beg of the list. If this cfs_rq is deleted somewhere
else, then the tmp_alone_branch will be illegal and cause
a list_add corruption.

(When enabled DEBUG_LIST, we fould this list_add corruption)

[ 2546.741103] list_add corruption. next->prev should be prev
(800b4d61ad40), but was 800ba434fa38. (next=800b6a95e740).
[ 2546.741130] [ cut here ]
[ 2546.741132] kernel BUG at lib/list_debug.c:25!
[ 2546.741136] Internal error: Oops - BUG: 0 [#1] SMP
[ 2546.742870] CPU: 1 PID: 29428 Comm: docker-runc Kdump: loaded Tainted: G E  
4.19.5-1.aarch64 #1
[ 2546.745415] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
[ 2546.747402] pstate: 4085 (nZcv daIf -PAN -UAO)
[ 2546.749015] pc : __list_add_valid+0x50/0x90
[ 2546.750485] lr : __list_add_valid+0x50/0x90
[ 2546.751975] sp : 1b5eb910
[ 2546.753286] x29: 1b5eb910 x28: 800abacf
[ 2546.754976] x27: 1b5ebbb0 x26: 0957
[ 2546.756665] x25: 0960d000 x24: 0250f41ca8f8
[ 2546.758366] x23: 800b6a95e740 x22: 800b4d61ad40
[ 2546.760066] x21: 800b4d61ad40 x20: 800ba434f080
[ 2546.761742] x19: 800b4d61ac00 x18: 
[ 2546.763425] x17:  x16: 
[ 2546.765089] x15: 09570748 x14: 662073617720
[ 2546.766755] x13: 747562202c293034 x12: 6461313664346230
[ 2546.768429] x11: 30382820 x10: 
[ 2546.770124] x9 : 0001 x8 : 09f34a0f
[ 2546.771831] x7 :  x6 : 250d
[ 2546.773525] x5 :  x4 : 
[ 2546.775227] x3 :  x2 : 70ef7f624013ca00
[ 2546.776929] x1 :  x0 : 0075
[ 2546.778623] Process docker-runc (pid: 29428, stack limit = 
0x293494a2)
[ 2546.780742] Call trace:
[ 2546.781955]  __list_add_valid+0x50/0x90
[ 2546.783469]  enqueue_entity+0x4a0/0x6e8
[ 2546.784957]  enqueue_task_fair+0xac/0x610
[ 2546.786502]  sched_move_task+0x134/0x178
[ 2546.787993]  cpu_cgroup_attach+0x40/0x78
[ 2546.789540]  cgroup_migrate_execute+0x378/0x3a8
[ 2546.791169]  cgroup_migrate+0x6c/0x90
[ 2546.792663]  cgroup_attach_task+0x148/0x238
[ 2546.794211]  __cgroup1_procs_write.isra.2+0xf8/0x160
[ 2546.795935]  cgroup1_procs_write+0x38/0x48
[ 2546.797492]  cgroup_file_write+0xa0/0x170
[ 2546.799010]  kernfs_fop_write+0x114/0x1e0
[ 2546.800558]  __vfs_write+0x60/0x190
[ 2546.801977]  vfs_write+0xac/0x1c0
[ 2546.803341]  ksys_write+0x6c/0xd8
[ 2546.804674]  __arm64_sys_write+0x24/0x30
[ 2546.806146]  el0_svc_common+0x78/0x100
[ 2546.807584]  el0_svc_handler+0x38/0x88
[ 2546.809017]  el0_svc+0x8/0xc

In this patch, we move rq->tmp_alone_branch point to its prev before delete it
from list.

Reported-by: Zhipeng Xie 
Cc: Bin Li 
Cc: [4.10+]
Fixes: 9c2791f936ef (sched/fair: Fix hierarchical order in rq->leaf_cfs_rq_list)
Signed-off-by: Xie XiuQi 
Tested-by: Zhipeng Xie 
---
 kernel/sched/fair.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ac855b2..7a72702 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -347,6 +347,11 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq 
*cfs_rq)
 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
if (cfs_rq->on_list) {
+   struct rq *rq = rq_of(cfs_rq);
+
+   if (rq->tmp_alone_branch == _rq->leaf_cfs_rq_list)
+   rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
+
list_del_rcu(_rq->leaf_cfs_rq_list);
cfs_rq->on_list = 0;
}
-- 
1.8.3.1

Re: [PATCH 1/2] arm64: avoid alloc memory on offline node

2018-06-19 Thread Xie XiuQi

Hi Lorenzo, Punit,


On 2018/6/20 0:32, Lorenzo Pieralisi wrote:
> On Tue, Jun 19, 2018 at 04:35:40PM +0100, Punit Agrawal wrote:
>> Michal Hocko  writes:
>>
>>> On Tue 19-06-18 15:54:26, Punit Agrawal wrote:
>>> [...]
>>>> In terms of $SUBJECT, I wonder if it's worth taking the original patch
>>>> as a temporary fix (it'll also be easier to backport) while we work on
>>>> fixing these other issues and enabling memoryless nodes.
>>>
>>> Well, x86 already does that but copying this antipatern is not really
>>> nice. So it is good as a quick fix but it would be definitely much
>>> better to have a robust fix. Who knows how many other places might hit
>>> this. You certainly do not want to add a hack like this all over...
>>
>> Completely agree! I was only suggesting it as a temporary measure,
>> especially as it looked like a proper fix might be invasive.
>>
>> Another fix might be to change the node specific allocation to node
>> agnostic allocations. It isn't clear why the allocation is being
>> requested from a specific node. I think Lorenzo suggested this in one of
>> the threads.
> 
> I think that code was just copypasted but it is better to fix the
> underlying issue.
> 
>> I've started putting together a set fixing the issues identified in this
>> thread. It should give a better idea on the best course of action.
> 
> On ACPI ARM64, this diff should do if I read the code correctly, it
> should be (famous last words) just a matter of mapping PXMs to nodes for
> every SRAT GICC entry, feel free to pick it up if it works.
> 
> Yes, we can take the original patch just because it is safer for an -rc
> cycle even though if the patch below would do delaying the fix for a
> couple of -rc (to get it tested across ACPI ARM64 NUMA platforms) is
> not a disaster.

I tested this patch on my arm board, it works.

-- 
Thanks,
Xie XiuQi

> 
> Lorenzo
> 
> -- >8 --
> diff --git a/arch/arm64/kernel/acpi_numa.c b/arch/arm64/kernel/acpi_numa.c
> index d190a7b231bf..877b268ef9fa 100644
> --- a/arch/arm64/kernel/acpi_numa.c
> +++ b/arch/arm64/kernel/acpi_numa.c
> @@ -70,12 +70,6 @@ void __init acpi_numa_gicc_affinity_init(struct 
> acpi_srat_gicc_affinity *pa)
>   if (!(pa->flags & ACPI_SRAT_GICC_ENABLED))
>   return;
>  
> - if (cpus_in_srat >= NR_CPUS) {
> - pr_warn_once("SRAT: cpu_to_node_map[%d] is too small, may not 
> be able to use all cpus\n",
> -  NR_CPUS);
> - return;
> - }
> -
>   pxm = pa->proximity_domain;
>   node = acpi_map_pxm_to_node(pxm);
>  
> @@ -85,6 +79,14 @@ void __init acpi_numa_gicc_affinity_init(struct 
> acpi_srat_gicc_affinity *pa)
>   return;
>   }
>  
> + node_set(node, numa_nodes_parsed);
> +
> + if (cpus_in_srat >= NR_CPUS) {
> + pr_warn_once("SRAT: cpu_to_node_map[%d] is too small, may not 
> be able to use all cpus\n",
> +  NR_CPUS);
> + return;
> + }
> +
>   mpidr = acpi_map_madt_entry(pa->acpi_processor_uid);
>   if (mpidr == PHYS_CPUID_INVALID) {
>   pr_err("SRAT: PXM %d with ACPI ID %d has no valid MPIDR in 
> MADT\n",
> @@ -95,7 +97,6 @@ void __init acpi_numa_gicc_affinity_init(struct 
> acpi_srat_gicc_affinity *pa)
>  
>   early_node_cpu_hwid[cpus_in_srat].node_id = node;
>   early_node_cpu_hwid[cpus_in_srat].cpu_hwid =  mpidr;
> - node_set(node, numa_nodes_parsed);
>   cpus_in_srat++;
>   pr_info("SRAT: PXM %d -> MPIDR 0x%Lx -> Node %d\n",
>   pxm, mpidr, node);
> 
> .
>

Re: [PATCH 1/2] arm64: avoid alloc memory on offline node

2018-06-19 Thread Xie XiuQi

Hi Lorenzo, Punit,


On 2018/6/20 0:32, Lorenzo Pieralisi wrote:
> On Tue, Jun 19, 2018 at 04:35:40PM +0100, Punit Agrawal wrote:
>> Michal Hocko  writes:
>>
>>> On Tue 19-06-18 15:54:26, Punit Agrawal wrote:
>>> [...]
>>>> In terms of $SUBJECT, I wonder if it's worth taking the original patch
>>>> as a temporary fix (it'll also be easier to backport) while we work on
>>>> fixing these other issues and enabling memoryless nodes.
>>>
>>> Well, x86 already does that but copying this antipatern is not really
>>> nice. So it is good as a quick fix but it would be definitely much
>>> better to have a robust fix. Who knows how many other places might hit
>>> this. You certainly do not want to add a hack like this all over...
>>
>> Completely agree! I was only suggesting it as a temporary measure,
>> especially as it looked like a proper fix might be invasive.
>>
>> Another fix might be to change the node specific allocation to node
>> agnostic allocations. It isn't clear why the allocation is being
>> requested from a specific node. I think Lorenzo suggested this in one of
>> the threads.
> 
> I think that code was just copypasted but it is better to fix the
> underlying issue.
> 
>> I've started putting together a set fixing the issues identified in this
>> thread. It should give a better idea on the best course of action.
> 
> On ACPI ARM64, this diff should do if I read the code correctly, it
> should be (famous last words) just a matter of mapping PXMs to nodes for
> every SRAT GICC entry, feel free to pick it up if it works.
> 
> Yes, we can take the original patch just because it is safer for an -rc
> cycle even though if the patch below would do delaying the fix for a
> couple of -rc (to get it tested across ACPI ARM64 NUMA platforms) is
> not a disaster.

I tested this patch on my arm board, it works.

-- 
Thanks,
Xie XiuQi

> 
> Lorenzo
> 
> -- >8 --
> diff --git a/arch/arm64/kernel/acpi_numa.c b/arch/arm64/kernel/acpi_numa.c
> index d190a7b231bf..877b268ef9fa 100644
> --- a/arch/arm64/kernel/acpi_numa.c
> +++ b/arch/arm64/kernel/acpi_numa.c
> @@ -70,12 +70,6 @@ void __init acpi_numa_gicc_affinity_init(struct 
> acpi_srat_gicc_affinity *pa)
>   if (!(pa->flags & ACPI_SRAT_GICC_ENABLED))
>   return;
>  
> - if (cpus_in_srat >= NR_CPUS) {
> - pr_warn_once("SRAT: cpu_to_node_map[%d] is too small, may not 
> be able to use all cpus\n",
> -  NR_CPUS);
> - return;
> - }
> -
>   pxm = pa->proximity_domain;
>   node = acpi_map_pxm_to_node(pxm);
>  
> @@ -85,6 +79,14 @@ void __init acpi_numa_gicc_affinity_init(struct 
> acpi_srat_gicc_affinity *pa)
>   return;
>   }
>  
> + node_set(node, numa_nodes_parsed);
> +
> + if (cpus_in_srat >= NR_CPUS) {
> + pr_warn_once("SRAT: cpu_to_node_map[%d] is too small, may not 
> be able to use all cpus\n",
> +  NR_CPUS);
> + return;
> + }
> +
>   mpidr = acpi_map_madt_entry(pa->acpi_processor_uid);
>   if (mpidr == PHYS_CPUID_INVALID) {
>   pr_err("SRAT: PXM %d with ACPI ID %d has no valid MPIDR in 
> MADT\n",
> @@ -95,7 +97,6 @@ void __init acpi_numa_gicc_affinity_init(struct 
> acpi_srat_gicc_affinity *pa)
>  
>   early_node_cpu_hwid[cpus_in_srat].node_id = node;
>   early_node_cpu_hwid[cpus_in_srat].cpu_hwid =  mpidr;
> - node_set(node, numa_nodes_parsed);
>   cpus_in_srat++;
>   pr_info("SRAT: PXM %d -> MPIDR 0x%Lx -> Node %d\n",
>   pxm, mpidr, node);
> 
> .
>

Re: [PATCH 1/2] arm64: avoid alloc memory on offline node

2018-06-19 Thread Xie XiuQi

Hi Michal,

On 2018/6/19 20:07, Michal Hocko wrote:
> On Tue 19-06-18 20:03:07, Xie XiuQi wrote:
> [...]
>> I tested on a arm board with 128 cores 4 numa nodes, but I set 
>> CONFIG_NR_CPUS=72.
>> Then node 3 is not be created, because node 3 has no memory, and no cpu.
>> But some pci device may related to node 3, which be set in ACPI table.
> 
> Could you double check that zonelists for node 3 are generated
> correctly?
> 

zonelists for node 3 is not created at all.

Kernel parse SRAT table to create node info, but in this case,
SRAT table is parsed not completed. Only the first 72 items are parsed.
In SRAT table, we haven't seen node 3 information yet, because 
cpu_to_node_map[72] is too small.

[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x3 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30001 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30002 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30003 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30100 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30101 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30102 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30103 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30200 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30201 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30202 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30203 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30300 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30301 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30302 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30303 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30400 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30401 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30402 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30403 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30500 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30501 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30502 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30503 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30600 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30601 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30602 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30603 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30700 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30701 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30702 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30703 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x1 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10001 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10002 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10003 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10100 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10101 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10102 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10103 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10200 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10201 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10202 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10203 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10300 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10301 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10302 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10303 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10400 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10401 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10402 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10403 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10500 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10501 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10502 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10503 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10600 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10601 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10602 -> Node 1

Re: [PATCH 1/2] arm64: avoid alloc memory on offline node

2018-06-19 Thread Xie XiuQi

Hi Michal,

On 2018/6/19 20:07, Michal Hocko wrote:
> On Tue 19-06-18 20:03:07, Xie XiuQi wrote:
> [...]
>> I tested on a arm board with 128 cores 4 numa nodes, but I set 
>> CONFIG_NR_CPUS=72.
>> Then node 3 is not be created, because node 3 has no memory, and no cpu.
>> But some pci device may related to node 3, which be set in ACPI table.
> 
> Could you double check that zonelists for node 3 are generated
> correctly?
> 

zonelists for node 3 is not created at all.

Kernel parse SRAT table to create node info, but in this case,
SRAT table is parsed not completed. Only the first 72 items are parsed.
In SRAT table, we haven't seen node 3 information yet, because 
cpu_to_node_map[72] is too small.

[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x3 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30001 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30002 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30003 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30100 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30101 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30102 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30103 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30200 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30201 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30202 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30203 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30300 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30301 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30302 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30303 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30400 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30401 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30402 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30403 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30500 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30501 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30502 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30503 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30600 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30601 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30602 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30603 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30700 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30701 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30702 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 0 -> MPIDR 0x30703 -> Node 0
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x1 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10001 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10002 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10003 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10100 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10101 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10102 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10103 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10200 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10201 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10202 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10203 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10300 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10301 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10302 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10303 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10400 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10401 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10402 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10403 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10500 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10501 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10502 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10503 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10600 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10601 -> Node 1
[0.00] ACPI: NUMA: SRAT: PXM 1 -> MPIDR 0x10602 -> Node 1

Re: [PATCH 1/2] arm64: avoid alloc memory on offline node

2018-06-19 Thread Xie XiuQi

Hi Punit,


On 2018/6/14 1:39, Punit Agrawal wrote:
> Punit Agrawal  writes:
> 
> 
> [...]
> 
>>
>> CONFIG_HAVE_MEMORYLESS node is not enabled on arm64 which means we end
>> up returning the original node in the fallback path.
>>
>> Xie, does the below patch help? I can submit a proper patch if this
>> fixes the issue for you.
>>
>> -- >8 --
>> Subject: [PATCH] arm64/numa: Enable memoryless numa nodes
>>
>> Signed-off-by: Punit Agrawal 
>> ---
>>  arch/arm64/Kconfig   | 4 
>>  arch/arm64/mm/numa.c | 2 ++
>>  2 files changed, 6 insertions(+)
>>
>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>> index eb2cf4938f6d..5317e9aa93ab 100644
>> --- a/arch/arm64/Kconfig
>> +++ b/arch/arm64/Kconfig
>> @@ -756,6 +756,10 @@ config USE_PERCPU_NUMA_NODE_ID
>>  def_bool y
>>  depends on NUMA
>>  
>> +config HAVE_MEMORYLESS_NODES
>> +   def_bool y
>> +   depends on NUMA
>> +
>>  config HAVE_SETUP_PER_CPU_AREA
>>  def_bool y
>>  depends on NUMA
>> diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
>> index dad128ba98bf..c699dcfe93de 100644
>> --- a/arch/arm64/mm/numa.c
>> +++ b/arch/arm64/mm/numa.c
>> @@ -73,6 +73,8 @@ EXPORT_SYMBOL(cpumask_of_node);
>>  static void map_cpu_to_node(unsigned int cpu, int nid)
>>  {
>>  set_cpu_numa_node(cpu, nid);
>> +set_numa_mem(local_memory_node(nid));
> 
> Argh, this should be
> 
> set_cpu_numa_mem(cpu, local_memory_node(nid));
> 
> There is not guarantee that map_cpu_to_node() will be called on the
> local cpu.
> 
> Hanjun, Xie - can you try with the update please?

I've tested this patch, but it does not help.
The boot message is attached.

I tested on a arm board with 128 cores 4 numa nodes, but I set 
CONFIG_NR_CPUS=72.
Then node 3 is not be created, because node 3 has no memory, and no cpu.
But some pci device may related to node 3, which be set in ACPI table.

165 /* Interface called from ACPI code to setup PCI host controller */
166 struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
167 {
168 int node = acpi_get_node(root->device->handle);
169 struct acpi_pci_generic_root_info *ri;
170 struct pci_bus *bus, *child;
171 struct acpi_pci_root_ops *root_ops;
172
// this node may be not created.
177 ri = kzalloc_node(sizeof(*ri), GFP_KERNEL, node);
178 if (!ri)
179 return NULL;
180
181 root_ops = kzalloc_node(sizeof(*root_ops), GFP_KERNEL, node);
182 if (!root_ops) {
183 kfree(ri);
184 return NULL;
185 }
186
187 ri->cfg = pci_acpi_setup_ecam_mapping(root);
188 if (!ri->cfg) {
189 kfree(ri);
190 kfree(root_ops);
191 return NULL;
192 }


> 
> Thanks,
> Punit
> 
>> +
>>  if (nid >= 0)
>>  cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
>>  }
> 
> .
> 

-- 
Thanks,
Xie XiuQi
[0.00] Booting Linux on physical CPU 0x03 [0x480fd010]
[0.00] Linux version 4.16.0-rc1-00491-g204a6cc-dirty 
(xiexiuqi@localhost.localdomain) (gcc version 6.3.1 20170404 (Linaro GCC 
6.3-2017.05)) #17 SMP PREEMPT Tue Jun 19 16:33:32 CST 2018
[0.00] earlycon: pl11 at MMIO32 0x9408 (options '')
[0.00] bootconsole [pl11] enabled
[0.00] efi: Getting EFI parameters from FDT:
[0.00] efi: EFI v2.60 by EDK II
[0.00] efi:  SMBIOS 3.0=0x3eb6  ACPI 2.0=0x3971  
MEMATTR=0x3b106418 
[0.00] ACPI: Early table checksum verification disabled
[0.00] ACPI: RSDP 0x3971 24 (v02 HISI  )
[0.00] ACPI: XSDT 0x3970 74 (v01 HISI   HIP08
  0113)
[0.00] ACPI: FACP 0x3963 000114 (v06 HISI   HIP08
 INTL 20151124)
[0.00] ACPI: DSDT 0x395C 006A1A (v02 HISI   HIP08
 INTL 20170929)
[0.00] ACPI: GTDT 0x3962 60 (v02 HISI   HIP08
 INTL 20151124)
[0.00] ACPI: DBG2 0x3961 5A (v00 HISI   HIP08
 INTL 20151124)
[0.00] ACPI: MCFG 0x3960 3C (v01 HISI   HIP08
 INTL 20151124)
[0.00] ACPI: SLIT 0x395F 3C (v01 HISI   HIP07
 INTL 20151124)
[0.00] ACPI: SRAT 0x395E 0009C0 (v03 HISI   HIP08
 INTL 20151124)
[0.00] ACPI: APIC 0x395D 00286C (v04 HISI   HIP08
 INTL 20151124)
[0.00] ACPI: IORT 0x395B 00110C (v00 HISI   HIP08
 INTL 20170929)
[0.00] ACPI: PPTT 0x311F 003

Re: [PATCH 1/2] arm64: avoid alloc memory on offline node

2018-06-19 Thread Xie XiuQi

Hi Punit,


On 2018/6/14 1:39, Punit Agrawal wrote:
> Punit Agrawal  writes:
> 
> 
> [...]
> 
>>
>> CONFIG_HAVE_MEMORYLESS node is not enabled on arm64 which means we end
>> up returning the original node in the fallback path.
>>
>> Xie, does the below patch help? I can submit a proper patch if this
>> fixes the issue for you.
>>
>> -- >8 --
>> Subject: [PATCH] arm64/numa: Enable memoryless numa nodes
>>
>> Signed-off-by: Punit Agrawal 
>> ---
>>  arch/arm64/Kconfig   | 4 
>>  arch/arm64/mm/numa.c | 2 ++
>>  2 files changed, 6 insertions(+)
>>
>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>> index eb2cf4938f6d..5317e9aa93ab 100644
>> --- a/arch/arm64/Kconfig
>> +++ b/arch/arm64/Kconfig
>> @@ -756,6 +756,10 @@ config USE_PERCPU_NUMA_NODE_ID
>>  def_bool y
>>  depends on NUMA
>>  
>> +config HAVE_MEMORYLESS_NODES
>> +   def_bool y
>> +   depends on NUMA
>> +
>>  config HAVE_SETUP_PER_CPU_AREA
>>  def_bool y
>>  depends on NUMA
>> diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
>> index dad128ba98bf..c699dcfe93de 100644
>> --- a/arch/arm64/mm/numa.c
>> +++ b/arch/arm64/mm/numa.c
>> @@ -73,6 +73,8 @@ EXPORT_SYMBOL(cpumask_of_node);
>>  static void map_cpu_to_node(unsigned int cpu, int nid)
>>  {
>>  set_cpu_numa_node(cpu, nid);
>> +set_numa_mem(local_memory_node(nid));
> 
> Argh, this should be
> 
> set_cpu_numa_mem(cpu, local_memory_node(nid));
> 
> There is not guarantee that map_cpu_to_node() will be called on the
> local cpu.
> 
> Hanjun, Xie - can you try with the update please?

I've tested this patch, but it does not help.
The boot message is attached.

I tested on a arm board with 128 cores 4 numa nodes, but I set 
CONFIG_NR_CPUS=72.
Then node 3 is not be created, because node 3 has no memory, and no cpu.
But some pci device may related to node 3, which be set in ACPI table.

165 /* Interface called from ACPI code to setup PCI host controller */
166 struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
167 {
168 int node = acpi_get_node(root->device->handle);
169 struct acpi_pci_generic_root_info *ri;
170 struct pci_bus *bus, *child;
171 struct acpi_pci_root_ops *root_ops;
172
// this node may be not created.
177 ri = kzalloc_node(sizeof(*ri), GFP_KERNEL, node);
178 if (!ri)
179 return NULL;
180
181 root_ops = kzalloc_node(sizeof(*root_ops), GFP_KERNEL, node);
182 if (!root_ops) {
183 kfree(ri);
184 return NULL;
185 }
186
187 ri->cfg = pci_acpi_setup_ecam_mapping(root);
188 if (!ri->cfg) {
189 kfree(ri);
190 kfree(root_ops);
191 return NULL;
192 }


> 
> Thanks,
> Punit
> 
>> +
>>  if (nid >= 0)
>>  cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
>>  }
> 
> .
> 

-- 
Thanks,
Xie XiuQi
[0.00] Booting Linux on physical CPU 0x03 [0x480fd010]
[0.00] Linux version 4.16.0-rc1-00491-g204a6cc-dirty 
(xiexiuqi@localhost.localdomain) (gcc version 6.3.1 20170404 (Linaro GCC 
6.3-2017.05)) #17 SMP PREEMPT Tue Jun 19 16:33:32 CST 2018
[0.00] earlycon: pl11 at MMIO32 0x9408 (options '')
[0.00] bootconsole [pl11] enabled
[0.00] efi: Getting EFI parameters from FDT:
[0.00] efi: EFI v2.60 by EDK II
[0.00] efi:  SMBIOS 3.0=0x3eb6  ACPI 2.0=0x3971  
MEMATTR=0x3b106418 
[0.00] ACPI: Early table checksum verification disabled
[0.00] ACPI: RSDP 0x3971 24 (v02 HISI  )
[0.00] ACPI: XSDT 0x3970 74 (v01 HISI   HIP08
  0113)
[0.00] ACPI: FACP 0x3963 000114 (v06 HISI   HIP08
 INTL 20151124)
[0.00] ACPI: DSDT 0x395C 006A1A (v02 HISI   HIP08
 INTL 20170929)
[0.00] ACPI: GTDT 0x3962 60 (v02 HISI   HIP08
 INTL 20151124)
[0.00] ACPI: DBG2 0x3961 5A (v00 HISI   HIP08
 INTL 20151124)
[0.00] ACPI: MCFG 0x3960 3C (v01 HISI   HIP08
 INTL 20151124)
[0.00] ACPI: SLIT 0x395F 3C (v01 HISI   HIP07
 INTL 20151124)
[0.00] ACPI: SRAT 0x395E 0009C0 (v03 HISI   HIP08
 INTL 20151124)
[0.00] ACPI: APIC 0x395D 00286C (v04 HISI   HIP08
 INTL 20151124)
[0.00] ACPI: IORT 0x395B 00110C (v00 HISI   HIP08
 INTL 20170929)
[0.00] ACPI: PPTT 0x311F 003

Re: [PATCH 1/2] arm64: avoid alloc memory on offline node

2018-06-11 Thread Xie XiuQi

Hi Michal,

On 2018/6/11 16:52, Michal Hocko wrote:
> On Mon 11-06-18 11:23:18, Xie XiuQi wrote:
>> Hi Michal,
>>
>> On 2018/6/7 20:21, Michal Hocko wrote:
>>> On Thu 07-06-18 19:55:53, Hanjun Guo wrote:
>>>> On 2018/6/7 18:55, Michal Hocko wrote:
>>> [...]
>>>>> I am not sure I have the full context but pci_acpi_scan_root calls
>>>>> kzalloc_node(sizeof(*info), GFP_KERNEL, node)
>>>>> and that should fall back to whatever node that is online. Offline node
>>>>> shouldn't keep any pages behind. So there must be something else going
>>>>> on here and the patch is not the right way to handle it. What does
>>>>> faddr2line __alloc_pages_nodemask+0xf0 tells on this kernel?
>>>>
>>>> The whole context is:
>>>>
>>>> The system is booted with a NUMA node has no memory attaching to it
>>>> (memory-less NUMA node), also with NR_CPUS less than CPUs presented
>>>> in MADT, so CPUs on this memory-less node are not brought up, and
>>>> this NUMA node will not be online (but SRAT presents this NUMA node);
>>>>
>>>> Devices attaching to this NUMA node such as PCI host bridge still
>>>> return the valid NUMA node via _PXM, but actually that valid NUMA node
>>>> is not online which lead to this issue.
>>>
>>> But we should have other numa nodes on the zonelists so the allocator
>>> should fall back to other node. If the zonelist is not intiailized
>>> properly, though, then this can indeed show up as a problem. Knowing
>>> which exact place has blown up would help get a better picture...
>>>
>>
>> I specific a non-exist node to allocate memory using kzalloc_node,
>> and got this following error message.
>>
>> And I found out there is just a VM_WARN, but it does not prevent the memory
>> allocation continue.
>>
>> This nid would be use to access NODE_DADA(nid), so if nid is invalid,
>> it would cause oops here.
>>
>> 459 /*
>> 460  * Allocate pages, preferring the node given as nid. The node must be 
>> valid and
>> 461  * online. For more general interface, see alloc_pages_node().
>> 462  */
>> 463 static inline struct page *
>> 464 __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
>> 465 {
>> 466 VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
>> 467 VM_WARN_ON(!node_online(nid));
>> 468
>> 469 return __alloc_pages(gfp_mask, order, nid);
>> 470 }
>> 471
>>
>> (I wrote a ko, to allocate memory on a non-exist node using kzalloc_node().)
> 
> OK, so this is an artificialy broken code, right. You shouldn't get a
> non-existent node via standard APIs AFAICS. The original report was
> about an existing node which is offline AFAIU. That would be a different
> case. If I am missing something and there are legitimate users that try
> to allocate from non-existing nodes then we should handle that in
> node_zonelist.

I think hanjun's comments may help to understood this question:
 - NUMA node will be built if CPUs and (or) memory are valid on this NUMA node;

 - But if we boot the system with memory-less node and also with CONFIG_NR_CPUS
   less than CPUs in SRAT, for example, 64 CPUs total with 4 NUMA nodes, 16 CPUs
   on each NUMA node, if we boot with CONFIG_NR_CPUS=48, then we will not built
   numa node for node 3, but with devices on that numa node, alloc memory will
   be panic because NUMA node 3 is not a valid node.

I triggered this BUG on arm64 platform, and I found a similar bug has been fixed
on x86 platform. So I sent a similar patch for this bug.

Or, could we consider to fix it in the mm subsystem?

>From b755de8dfdfef97effaa91379ffafcb81f4d62a1 Mon Sep 17 00:00:00 2001
From: Yinghai Lu 
Date: Wed, 20 Feb 2008 12:41:52 -0800
Subject: [PATCH] x86: make dev_to_node return online node

a numa system (with multi HT chains) may return node without ram. Aka it
is not online. Try to get an online node, otherwise return -1.

Signed-off-by: Yinghai Lu 
Signed-off-by: Ingo Molnar 
Signed-off-by: Thomas Gleixner 
---
 arch/x86/pci/acpi.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index d95de2f..ea8685f 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -172,6 +172,9 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct 
acpi_device *device, int do
set_mp_bus_to_node(busnum, node);
else
node = get_mp_bus_to_node(busnum);
+
+   if (node != -1 && !node_online(node))
+   node = -1;
 #endif

/* Allocate per-root-bus (not per bus) arch-specific data.
-- 
1.8.3.1


> 
> [...]
> 

-- 
Thanks,
Xie XiuQi

Re: [PATCH 1/2] arm64: avoid alloc memory on offline node

2018-06-11 Thread Xie XiuQi

Hi Michal,

On 2018/6/11 16:52, Michal Hocko wrote:
> On Mon 11-06-18 11:23:18, Xie XiuQi wrote:
>> Hi Michal,
>>
>> On 2018/6/7 20:21, Michal Hocko wrote:
>>> On Thu 07-06-18 19:55:53, Hanjun Guo wrote:
>>>> On 2018/6/7 18:55, Michal Hocko wrote:
>>> [...]
>>>>> I am not sure I have the full context but pci_acpi_scan_root calls
>>>>> kzalloc_node(sizeof(*info), GFP_KERNEL, node)
>>>>> and that should fall back to whatever node that is online. Offline node
>>>>> shouldn't keep any pages behind. So there must be something else going
>>>>> on here and the patch is not the right way to handle it. What does
>>>>> faddr2line __alloc_pages_nodemask+0xf0 tells on this kernel?
>>>>
>>>> The whole context is:
>>>>
>>>> The system is booted with a NUMA node has no memory attaching to it
>>>> (memory-less NUMA node), also with NR_CPUS less than CPUs presented
>>>> in MADT, so CPUs on this memory-less node are not brought up, and
>>>> this NUMA node will not be online (but SRAT presents this NUMA node);
>>>>
>>>> Devices attaching to this NUMA node such as PCI host bridge still
>>>> return the valid NUMA node via _PXM, but actually that valid NUMA node
>>>> is not online which lead to this issue.
>>>
>>> But we should have other numa nodes on the zonelists so the allocator
>>> should fall back to other node. If the zonelist is not intiailized
>>> properly, though, then this can indeed show up as a problem. Knowing
>>> which exact place has blown up would help get a better picture...
>>>
>>
>> I specific a non-exist node to allocate memory using kzalloc_node,
>> and got this following error message.
>>
>> And I found out there is just a VM_WARN, but it does not prevent the memory
>> allocation continue.
>>
>> This nid would be use to access NODE_DADA(nid), so if nid is invalid,
>> it would cause oops here.
>>
>> 459 /*
>> 460  * Allocate pages, preferring the node given as nid. The node must be 
>> valid and
>> 461  * online. For more general interface, see alloc_pages_node().
>> 462  */
>> 463 static inline struct page *
>> 464 __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
>> 465 {
>> 466 VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
>> 467 VM_WARN_ON(!node_online(nid));
>> 468
>> 469 return __alloc_pages(gfp_mask, order, nid);
>> 470 }
>> 471
>>
>> (I wrote a ko, to allocate memory on a non-exist node using kzalloc_node().)
> 
> OK, so this is an artificialy broken code, right. You shouldn't get a
> non-existent node via standard APIs AFAICS. The original report was
> about an existing node which is offline AFAIU. That would be a different
> case. If I am missing something and there are legitimate users that try
> to allocate from non-existing nodes then we should handle that in
> node_zonelist.

I think hanjun's comments may help to understood this question:
 - NUMA node will be built if CPUs and (or) memory are valid on this NUMA node;

 - But if we boot the system with memory-less node and also with CONFIG_NR_CPUS
   less than CPUs in SRAT, for example, 64 CPUs total with 4 NUMA nodes, 16 CPUs
   on each NUMA node, if we boot with CONFIG_NR_CPUS=48, then we will not built
   numa node for node 3, but with devices on that numa node, alloc memory will
   be panic because NUMA node 3 is not a valid node.

I triggered this BUG on arm64 platform, and I found a similar bug has been fixed
on x86 platform. So I sent a similar patch for this bug.

Or, could we consider to fix it in the mm subsystem?

>From b755de8dfdfef97effaa91379ffafcb81f4d62a1 Mon Sep 17 00:00:00 2001
From: Yinghai Lu 
Date: Wed, 20 Feb 2008 12:41:52 -0800
Subject: [PATCH] x86: make dev_to_node return online node

a numa system (with multi HT chains) may return node without ram. Aka it
is not online. Try to get an online node, otherwise return -1.

Signed-off-by: Yinghai Lu 
Signed-off-by: Ingo Molnar 
Signed-off-by: Thomas Gleixner 
---
 arch/x86/pci/acpi.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index d95de2f..ea8685f 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -172,6 +172,9 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct 
acpi_device *device, int do
set_mp_bus_to_node(busnum, node);
else
node = get_mp_bus_to_node(busnum);
+
+   if (node != -1 && !node_online(node))
+   node = -1;
 #endif

/* Allocate per-root-bus (not per bus) arch-specific data.
-- 
1.8.3.1


> 
> [...]
> 

-- 
Thanks,
Xie XiuQi

Re: [PATCH 1/2] arm64: avoid alloc memory on offline node

2018-06-10 Thread Xie XiuQi

: 0041a2d8 R08:  R09: 7ffd7d73f8b8
[  120.737019] R10: 0003 R11: 0206 R12: 
[  120.744123] R13: 00c20130 R14:  R15: 
[  120.751230] Code: 89 c6 74 0d e8 55 ab 5e 00 8b 74 24 1c 48 8b 3c 24 48 8b 
54 24 08 89 d9 c1 e9 17 83 e1 01 48 85 d2 88 4c 24 20 0f 85 25 01 00 00 <3b> 77 
08 0f 82 1c 01 00 00 48 89 f8 44 89 ea 48 89 e1 44 89 e6
[  120.770020] RIP: __alloc_pages_nodemask+0x10d/0x2c0 RSP: 881196947a90
[  120.776780] CR2: 2088
[  120.780116] ---[ end trace 89f801c36550734f ]---
[  120.978922] Kernel panic - not syncing: Fatal exception
[  120.984186] Kernel Offset: 0x3380 from 0xffff8100 (relocation 
range: 0x8000-0xbfff)
[  121.209501] ---[ end Kernel panic - not syncing: Fatal exception ]---



-- 
Thanks,
Xie XiuQi

Re: [PATCH 1/2] arm64: avoid alloc memory on offline node

2018-06-10 Thread Xie XiuQi

: 0041a2d8 R08:  R09: 7ffd7d73f8b8
[  120.737019] R10: 0003 R11: 0206 R12: 
[  120.744123] R13: 00c20130 R14:  R15: 
[  120.751230] Code: 89 c6 74 0d e8 55 ab 5e 00 8b 74 24 1c 48 8b 3c 24 48 8b 
54 24 08 89 d9 c1 e9 17 83 e1 01 48 85 d2 88 4c 24 20 0f 85 25 01 00 00 <3b> 77 
08 0f 82 1c 01 00 00 48 89 f8 44 89 ea 48 89 e1 44 89 e6
[  120.770020] RIP: __alloc_pages_nodemask+0x10d/0x2c0 RSP: 881196947a90
[  120.776780] CR2: 2088
[  120.780116] ---[ end trace 89f801c36550734f ]---
[  120.978922] Kernel panic - not syncing: Fatal exception
[  120.984186] Kernel Offset: 0x3380 from 0xffff8100 (relocation 
range: 0x8000-0xbfff)
[  121.209501] ---[ end Kernel panic - not syncing: Fatal exception ]---



-- 
Thanks,
Xie XiuQi

[PATCH 2/2] drivers: check numa node's online status in dev_to_node

2018-05-31 Thread Xie XiuQi

If dev->numa_node is not available (or offline), we should
return NUMA_NO_NODE to prevent alloc memory on offline
nodes, which could cause oops.

For example, a numa node:
1) without memory
2) NR_CPUS is very small, and the cpus on the node are not brought up

[   27.851041] Unable to handle kernel NULL pointer dereference at virtual 
address 1988
[   27.859128] Mem abort info:
[   27.861908]   ESR = 0x9605
[   27.864949]   Exception class = DABT (current EL), IL = 32 bits
[   27.870860]   SET = 0, FnV = 0
[   27.873900]   EA = 0, S1PTW = 0
[   27.877029] Data abort info:
[   27.879895]   ISV = 0, ISS = 0x0005
[   27.883716]   CM = 0, WnR = 0
[   27.886673] [1988] user address but active_mm is swapper
[   27.893012] Internal error: Oops: 9605 [#1] SMP
[   27.897876] Modules linked in:
[   27.900919] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 4.17.0-rc6-mpam+ #116
[   27.907865] Hardware name: Huawei D06/D06, BIOS Hisilicon D06 EC UEFI Nemo 
2.0 RC0 - B306 05/28/2018
[   27.916983] pstate: 80c9 (Nzcv daif +PAN +UAO)
[   27.921763] pc : __alloc_pages_nodemask+0xf0/0xe70
[   27.926540] lr : __alloc_pages_nodemask+0x184/0xe70
[   27.931403] sp : 0996f7e0
[   27.934704] x29: 0996f7e0 x28: 08cb10a0
[   27.940003] x27: 014012c0 x26: 
[   27.945301] x25: 0003 x24: 085bbc14
[   27.950600] x23: 0040 x22: 
[   27.955898] x21: 0001 x20: 
[   27.961196] x19: 0040 x18: 0f00
[   27.966494] x17: 003bff88 x16: 0020
[   27.971792] x15: 003b x14: 
[   27.977090] x13:  x12: 0030
[   27.982388] x11: 0101010101010101 x10: 7f7f7f7f7f7f7f7f
[   27.987686] x9 : 2e64716e622e7364 x8 : 7f7f7f7f7f7f7f7f
[   27.992984] x7 :  x6 : 08d73c08
[   27.998282] x5 :  x4 : 0081
[   28.003580] x3 :  x2 : 
[   28.008878] x1 : 0001 x0 : 1980
[   28.014177] Process swapper/0 (pid: 1, stack limit = 0x(ptrval))
[   28.020863] Call trace:
[   28.023296]  __alloc_pages_nodemask+0xf0/0xe70
[   28.027727]  allocate_slab+0x94/0x590
[   28.031374]  new_slab+0x68/0xc8
[   28.034502]  ___slab_alloc+0x444/0x4f8
[   28.038237]  __slab_alloc+0x50/0x68
[   28.041713]  __kmalloc_node_track_caller+0x100/0x320
[   28.046664]  devm_kmalloc+0x3c/0x90
[   28.050139]  pinctrl_bind_pins+0x4c/0x298
[   28.054135]  driver_probe_device+0xb4/0x4a0
[   28.058305]  __driver_attach+0x124/0x128
[   28.062213]  bus_for_each_dev+0x78/0xe0
[   28.066035]  driver_attach+0x30/0x40
[   28.069597]  bus_add_driver+0x248/0x2b8
[   28.073419]  driver_register+0x68/0x100
[   28.077242]  __pci_register_driver+0x64/0x78
[   28.081500]  pcie_portdrv_init+0x44/0x4c
[   28.085410]  do_one_initcall+0x54/0x208
[   28.089232]  kernel_init_freeable+0x244/0x340
[   28.093577]  kernel_init+0x18/0x118
[   28.097052]  ret_from_fork+0x10/0x1c
[   28.100614] Code: 7100047f 321902a4 1a950095 b5000602 (b9400803)
[   28.106740] ---[ end trace e32df44e6e1c3a4b ]---

Signed-off-by: Xie XiuQi 
Tested-by: Huiqiang Wang 
Cc: Hanjun Guo 
Cc: Tomasz Nowicki 
Cc: Xishi Qiu 
---
 include/linux/device.h | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/linux/device.h b/include/linux/device.h
index 4779569..2a4fb08 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1017,7 +1017,12 @@ extern __printf(2, 3)
 #ifdef CONFIG_NUMA
 static inline int dev_to_node(struct device *dev)
 {
-   return dev->numa_node;
+   int node = dev->numa_node;
+
+   if (unlikely(node != NUMA_NO_NODE && !node_online(node)))
+   return NUMA_NO_NODE;
+
+   return node;
 }
 static inline void set_dev_node(struct device *dev, int node)
 {
-- 
1.8.3.1

[PATCH 0/2] arm64/drivers: avoid alloc memory on offline node

2018-05-31 Thread Xie XiuQi

A numa system may return node which is not online.
For example, a numa node:
1) without memory
2) NR_CPUS is very small, and the cpus on the node are not brought up

In this situation, we use NUMA_NO_NODE to avoid oops.

[   25.732905] Unable to handle kernel NULL pointer dereference at virtual 
address 1988
[   25.740982] Mem abort info:
[   25.743762]   ESR = 0x9605
[   25.746803]   Exception class = DABT (current EL), IL = 32 bits
[   25.752711]   SET = 0, FnV = 0
[   25.755751]   EA = 0, S1PTW = 0
[   25.758878] Data abort info:
[   25.761745]   ISV = 0, ISS = 0x0005
[   25.765568]   CM = 0, WnR = 0
[   25.768521] [1988] user address but active_mm is swapper
[   25.774861] Internal error: Oops: 9605 [#1] SMP
[   25.779724] Modules linked in:
[   25.782768] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 4.17.0-rc6-mpam+ #115
[   25.789714] Hardware name: Huawei D06/D06, BIOS Hisilicon D06 EC UEFI Nemo 
2.0 RC0 - B305 05/28/2018
[   25.798831] pstate: 80c9 (Nzcv daif +PAN +UAO)
[   25.803612] pc : __alloc_pages_nodemask+0xf0/0xe70
[   25.808389] lr : __alloc_pages_nodemask+0x184/0xe70
[   25.813252] sp : 0996f660
[   25.816553] x29: 0996f660 x28: 
[   25.821852] x27: 014012c0 x26: 
[   25.827150] x25: 0003 x24: 08099eac
[   25.832449] x23: 0040 x22: 
[   25.837747] x21: 0001 x20: 
[   25.843045] x19: 0040 x18: 00010e00
[   25.848343] x17: 0437f790 x16: 0020
[   25.853641] x15:  x14: 6549435020524541
[   25.858939] x13: 20454d502067756c x12: 
[   25.864237] x11: 0996f6f0 x10: 0006
[   25.869536] x9 : 12a4 x8 : 8023c000ff90
[   25.874834] x7 :  x6 : 08d73c08
[   25.880132] x5 :  x4 : 0081
[   25.885430] x3 :  x2 : 
[   25.890728] x1 : 0001 x0 : 1980
[   25.896027] Process swapper/0 (pid: 1, stack limit = 0x(ptrval))
[   25.902712] Call trace:
[   25.905146]  __alloc_pages_nodemask+0xf0/0xe70
[   25.909577]  allocate_slab+0x94/0x590
[   25.913225]  new_slab+0x68/0xc8
[   25.916353]  ___slab_alloc+0x444/0x4f8
[   25.920088]  __slab_alloc+0x50/0x68
[   25.923562]  kmem_cache_alloc_node_trace+0xe8/0x230
[   25.928426]  pci_acpi_scan_root+0x94/0x278
[   25.932510]  acpi_pci_root_add+0x228/0x4b0
[   25.936593]  acpi_bus_attach+0x10c/0x218
[   25.940501]  acpi_bus_attach+0xac/0x218
[   25.944323]  acpi_bus_attach+0xac/0x218
[   25.948144]  acpi_bus_scan+0x5c/0xc0
[   25.951708]  acpi_scan_init+0xf8/0x254
[   25.955443]  acpi_init+0x310/0x37c
[   25.958831]  do_one_initcall+0x54/0x208
[   25.962653]  kernel_init_freeable+0x244/0x340
[   25.966999]  kernel_init+0x18/0x118
[   25.970474]  ret_from_fork+0x10/0x1c
[   25.974036] Code: 7100047f 321902a4 1a950095 b5000602 (b9400803)
[   25.980162] ---[ end trace 64f0893eb21ec283 ]---
[   25.984765] Kernel panic - not syncing: Fatal exception

Xie XiuQi (2):
  arm64: avoid alloc memory on offline node
  drivers: check numa node's online status in dev_to_node

 arch/arm64/kernel/pci.c | 3 +++
 include/linux/device.h  | 7 ++-
 2 files changed, 9 insertions(+), 1 deletion(-)

-- 
1.8.3.1

[PATCH 0/2] arm64/drivers: avoid alloc memory on offline node

2018-05-31 Thread Xie XiuQi

A numa system may return node which is not online.
For example, a numa node:
1) without memory
2) NR_CPUS is very small, and the cpus on the node are not brought up

In this situation, we use NUMA_NO_NODE to avoid oops.

[   25.732905] Unable to handle kernel NULL pointer dereference at virtual 
address 1988
[   25.740982] Mem abort info:
[   25.743762]   ESR = 0x9605
[   25.746803]   Exception class = DABT (current EL), IL = 32 bits
[   25.752711]   SET = 0, FnV = 0
[   25.755751]   EA = 0, S1PTW = 0
[   25.758878] Data abort info:
[   25.761745]   ISV = 0, ISS = 0x0005
[   25.765568]   CM = 0, WnR = 0
[   25.768521] [1988] user address but active_mm is swapper
[   25.774861] Internal error: Oops: 9605 [#1] SMP
[   25.779724] Modules linked in:
[   25.782768] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 4.17.0-rc6-mpam+ #115
[   25.789714] Hardware name: Huawei D06/D06, BIOS Hisilicon D06 EC UEFI Nemo 
2.0 RC0 - B305 05/28/2018
[   25.798831] pstate: 80c9 (Nzcv daif +PAN +UAO)
[   25.803612] pc : __alloc_pages_nodemask+0xf0/0xe70
[   25.808389] lr : __alloc_pages_nodemask+0x184/0xe70
[   25.813252] sp : 0996f660
[   25.816553] x29: 0996f660 x28: 
[   25.821852] x27: 014012c0 x26: 
[   25.827150] x25: 0003 x24: 08099eac
[   25.832449] x23: 0040 x22: 
[   25.837747] x21: 0001 x20: 
[   25.843045] x19: 0040 x18: 00010e00
[   25.848343] x17: 0437f790 x16: 0020
[   25.853641] x15:  x14: 6549435020524541
[   25.858939] x13: 20454d502067756c x12: 
[   25.864237] x11: 0996f6f0 x10: 0006
[   25.869536] x9 : 12a4 x8 : 8023c000ff90
[   25.874834] x7 :  x6 : 08d73c08
[   25.880132] x5 :  x4 : 0081
[   25.885430] x3 :  x2 : 
[   25.890728] x1 : 0001 x0 : 1980
[   25.896027] Process swapper/0 (pid: 1, stack limit = 0x(ptrval))
[   25.902712] Call trace:
[   25.905146]  __alloc_pages_nodemask+0xf0/0xe70
[   25.909577]  allocate_slab+0x94/0x590
[   25.913225]  new_slab+0x68/0xc8
[   25.916353]  ___slab_alloc+0x444/0x4f8
[   25.920088]  __slab_alloc+0x50/0x68
[   25.923562]  kmem_cache_alloc_node_trace+0xe8/0x230
[   25.928426]  pci_acpi_scan_root+0x94/0x278
[   25.932510]  acpi_pci_root_add+0x228/0x4b0
[   25.936593]  acpi_bus_attach+0x10c/0x218
[   25.940501]  acpi_bus_attach+0xac/0x218
[   25.944323]  acpi_bus_attach+0xac/0x218
[   25.948144]  acpi_bus_scan+0x5c/0xc0
[   25.951708]  acpi_scan_init+0xf8/0x254
[   25.955443]  acpi_init+0x310/0x37c
[   25.958831]  do_one_initcall+0x54/0x208
[   25.962653]  kernel_init_freeable+0x244/0x340
[   25.966999]  kernel_init+0x18/0x118
[   25.970474]  ret_from_fork+0x10/0x1c
[   25.974036] Code: 7100047f 321902a4 1a950095 b5000602 (b9400803)
[   25.980162] ---[ end trace 64f0893eb21ec283 ]---
[   25.984765] Kernel panic - not syncing: Fatal exception

Xie XiuQi (2):
  arm64: avoid alloc memory on offline node
  drivers: check numa node's online status in dev_to_node

 arch/arm64/kernel/pci.c | 3 +++
 include/linux/device.h  | 7 ++-
 2 files changed, 9 insertions(+), 1 deletion(-)

-- 
1.8.3.1

[PATCH 2/2] drivers: check numa node's online status in dev_to_node

2018-05-31 Thread Xie XiuQi

If dev->numa_node is not available (or offline), we should
return NUMA_NO_NODE to prevent alloc memory on offline
nodes, which could cause oops.

For example, a numa node:
1) without memory
2) NR_CPUS is very small, and the cpus on the node are not brought up

[   27.851041] Unable to handle kernel NULL pointer dereference at virtual 
address 1988
[   27.859128] Mem abort info:
[   27.861908]   ESR = 0x9605
[   27.864949]   Exception class = DABT (current EL), IL = 32 bits
[   27.870860]   SET = 0, FnV = 0
[   27.873900]   EA = 0, S1PTW = 0
[   27.877029] Data abort info:
[   27.879895]   ISV = 0, ISS = 0x0005
[   27.883716]   CM = 0, WnR = 0
[   27.886673] [1988] user address but active_mm is swapper
[   27.893012] Internal error: Oops: 9605 [#1] SMP
[   27.897876] Modules linked in:
[   27.900919] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 4.17.0-rc6-mpam+ #116
[   27.907865] Hardware name: Huawei D06/D06, BIOS Hisilicon D06 EC UEFI Nemo 
2.0 RC0 - B306 05/28/2018
[   27.916983] pstate: 80c9 (Nzcv daif +PAN +UAO)
[   27.921763] pc : __alloc_pages_nodemask+0xf0/0xe70
[   27.926540] lr : __alloc_pages_nodemask+0x184/0xe70
[   27.931403] sp : 0996f7e0
[   27.934704] x29: 0996f7e0 x28: 08cb10a0
[   27.940003] x27: 014012c0 x26: 
[   27.945301] x25: 0003 x24: 085bbc14
[   27.950600] x23: 0040 x22: 
[   27.955898] x21: 0001 x20: 
[   27.961196] x19: 0040 x18: 0f00
[   27.966494] x17: 003bff88 x16: 0020
[   27.971792] x15: 003b x14: 
[   27.977090] x13:  x12: 0030
[   27.982388] x11: 0101010101010101 x10: 7f7f7f7f7f7f7f7f
[   27.987686] x9 : 2e64716e622e7364 x8 : 7f7f7f7f7f7f7f7f
[   27.992984] x7 :  x6 : 08d73c08
[   27.998282] x5 :  x4 : 0081
[   28.003580] x3 :  x2 : 
[   28.008878] x1 : 0001 x0 : 1980
[   28.014177] Process swapper/0 (pid: 1, stack limit = 0x(ptrval))
[   28.020863] Call trace:
[   28.023296]  __alloc_pages_nodemask+0xf0/0xe70
[   28.027727]  allocate_slab+0x94/0x590
[   28.031374]  new_slab+0x68/0xc8
[   28.034502]  ___slab_alloc+0x444/0x4f8
[   28.038237]  __slab_alloc+0x50/0x68
[   28.041713]  __kmalloc_node_track_caller+0x100/0x320
[   28.046664]  devm_kmalloc+0x3c/0x90
[   28.050139]  pinctrl_bind_pins+0x4c/0x298
[   28.054135]  driver_probe_device+0xb4/0x4a0
[   28.058305]  __driver_attach+0x124/0x128
[   28.062213]  bus_for_each_dev+0x78/0xe0
[   28.066035]  driver_attach+0x30/0x40
[   28.069597]  bus_add_driver+0x248/0x2b8
[   28.073419]  driver_register+0x68/0x100
[   28.077242]  __pci_register_driver+0x64/0x78
[   28.081500]  pcie_portdrv_init+0x44/0x4c
[   28.085410]  do_one_initcall+0x54/0x208
[   28.089232]  kernel_init_freeable+0x244/0x340
[   28.093577]  kernel_init+0x18/0x118
[   28.097052]  ret_from_fork+0x10/0x1c
[   28.100614] Code: 7100047f 321902a4 1a950095 b5000602 (b9400803)
[   28.106740] ---[ end trace e32df44e6e1c3a4b ]---

Signed-off-by: Xie XiuQi 
Tested-by: Huiqiang Wang 
Cc: Hanjun Guo 
Cc: Tomasz Nowicki 
Cc: Xishi Qiu 
---
 include/linux/device.h | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/linux/device.h b/include/linux/device.h
index 4779569..2a4fb08 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1017,7 +1017,12 @@ extern __printf(2, 3)
 #ifdef CONFIG_NUMA
 static inline int dev_to_node(struct device *dev)
 {
-   return dev->numa_node;
+   int node = dev->numa_node;
+
+   if (unlikely(node != NUMA_NO_NODE && !node_online(node)))
+   return NUMA_NO_NODE;
+
+   return node;
 }
 static inline void set_dev_node(struct device *dev, int node)
 {
-- 
1.8.3.1

[PATCH 1/2] arm64: avoid alloc memory on offline node

2018-05-31 Thread Xie XiuQi

A numa system may return node which is not online.
For example, a numa node:
1) without memory
2) NR_CPUS is very small, and the cpus on the node are not brought up

In this situation, we use NUMA_NO_NODE to avoid oops.

[   25.732905] Unable to handle kernel NULL pointer dereference at virtual 
address 1988
[   25.740982] Mem abort info:
[   25.743762]   ESR = 0x9605
[   25.746803]   Exception class = DABT (current EL), IL = 32 bits
[   25.752711]   SET = 0, FnV = 0
[   25.755751]   EA = 0, S1PTW = 0
[   25.758878] Data abort info:
[   25.761745]   ISV = 0, ISS = 0x0005
[   25.765568]   CM = 0, WnR = 0
[   25.768521] [1988] user address but active_mm is swapper
[   25.774861] Internal error: Oops: 9605 [#1] SMP
[   25.779724] Modules linked in:
[   25.782768] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 4.17.0-rc6-mpam+ #115
[   25.789714] Hardware name: Huawei D06/D06, BIOS Hisilicon D06 EC UEFI Nemo 
2.0 RC0 - B305 05/28/2018
[   25.798831] pstate: 80c9 (Nzcv daif +PAN +UAO)
[   25.803612] pc : __alloc_pages_nodemask+0xf0/0xe70
[   25.808389] lr : __alloc_pages_nodemask+0x184/0xe70
[   25.813252] sp : 0996f660
[   25.816553] x29: 0996f660 x28: 
[   25.821852] x27: 014012c0 x26: 
[   25.827150] x25: 0003 x24: 08099eac
[   25.832449] x23: 0040 x22: 
[   25.837747] x21: 0001 x20: 
[   25.843045] x19: 0040 x18: 00010e00
[   25.848343] x17: 0437f790 x16: 0020
[   25.853641] x15:  x14: 6549435020524541
[   25.858939] x13: 20454d502067756c x12: 
[   25.864237] x11: 0996f6f0 x10: 0006
[   25.869536] x9 : 12a4 x8 : 8023c000ff90
[   25.874834] x7 :  x6 : 08d73c08
[   25.880132] x5 :  x4 : 0081
[   25.885430] x3 :  x2 : 
[   25.890728] x1 : 0001 x0 : 1980
[   25.896027] Process swapper/0 (pid: 1, stack limit = 0x(ptrval))
[   25.902712] Call trace:
[   25.905146]  __alloc_pages_nodemask+0xf0/0xe70
[   25.909577]  allocate_slab+0x94/0x590
[   25.913225]  new_slab+0x68/0xc8
[   25.916353]  ___slab_alloc+0x444/0x4f8
[   25.920088]  __slab_alloc+0x50/0x68
[   25.923562]  kmem_cache_alloc_node_trace+0xe8/0x230
[   25.928426]  pci_acpi_scan_root+0x94/0x278
[   25.932510]  acpi_pci_root_add+0x228/0x4b0
[   25.936593]  acpi_bus_attach+0x10c/0x218
[   25.940501]  acpi_bus_attach+0xac/0x218
[   25.944323]  acpi_bus_attach+0xac/0x218
[   25.948144]  acpi_bus_scan+0x5c/0xc0
[   25.951708]  acpi_scan_init+0xf8/0x254
[   25.955443]  acpi_init+0x310/0x37c
[   25.958831]  do_one_initcall+0x54/0x208
[   25.962653]  kernel_init_freeable+0x244/0x340
[   25.966999]  kernel_init+0x18/0x118
[   25.970474]  ret_from_fork+0x10/0x1c
[   25.974036] Code: 7100047f 321902a4 1a950095 b5000602 (b9400803)
[   25.980162] ---[ end trace 64f0893eb21ec283 ]---
[   25.984765] Kernel panic - not syncing: Fatal exception

Signed-off-by: Xie XiuQi 
Tested-by: Huiqiang Wang 
Cc: Hanjun Guo 
Cc: Tomasz Nowicki 
Cc: Xishi Qiu 
---
 arch/arm64/kernel/pci.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c
index 0e2ea1c..e17cc45 100644
--- a/arch/arm64/kernel/pci.c
+++ b/arch/arm64/kernel/pci.c
@@ -170,6 +170,9 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root 
*root)
struct pci_bus *bus, *child;
struct acpi_pci_root_ops *root_ops;
 
+   if (node != NUMA_NO_NODE && !node_online(node))
+   node = NUMA_NO_NODE;
+
ri = kzalloc_node(sizeof(*ri), GFP_KERNEL, node);
if (!ri)
return NULL;
-- 
1.8.3.1

[PATCH 1/2] arm64: avoid alloc memory on offline node

2018-05-31 Thread Xie XiuQi

A numa system may return node which is not online.
For example, a numa node:
1) without memory
2) NR_CPUS is very small, and the cpus on the node are not brought up

In this situation, we use NUMA_NO_NODE to avoid oops.

[   25.732905] Unable to handle kernel NULL pointer dereference at virtual 
address 1988
[   25.740982] Mem abort info:
[   25.743762]   ESR = 0x9605
[   25.746803]   Exception class = DABT (current EL), IL = 32 bits
[   25.752711]   SET = 0, FnV = 0
[   25.755751]   EA = 0, S1PTW = 0
[   25.758878] Data abort info:
[   25.761745]   ISV = 0, ISS = 0x0005
[   25.765568]   CM = 0, WnR = 0
[   25.768521] [1988] user address but active_mm is swapper
[   25.774861] Internal error: Oops: 9605 [#1] SMP
[   25.779724] Modules linked in:
[   25.782768] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 4.17.0-rc6-mpam+ #115
[   25.789714] Hardware name: Huawei D06/D06, BIOS Hisilicon D06 EC UEFI Nemo 
2.0 RC0 - B305 05/28/2018
[   25.798831] pstate: 80c9 (Nzcv daif +PAN +UAO)
[   25.803612] pc : __alloc_pages_nodemask+0xf0/0xe70
[   25.808389] lr : __alloc_pages_nodemask+0x184/0xe70
[   25.813252] sp : 0996f660
[   25.816553] x29: 0996f660 x28: 
[   25.821852] x27: 014012c0 x26: 
[   25.827150] x25: 0003 x24: 08099eac
[   25.832449] x23: 0040 x22: 
[   25.837747] x21: 0001 x20: 
[   25.843045] x19: 0040 x18: 00010e00
[   25.848343] x17: 0437f790 x16: 0020
[   25.853641] x15:  x14: 6549435020524541
[   25.858939] x13: 20454d502067756c x12: 
[   25.864237] x11: 0996f6f0 x10: 0006
[   25.869536] x9 : 12a4 x8 : 8023c000ff90
[   25.874834] x7 :  x6 : 08d73c08
[   25.880132] x5 :  x4 : 0081
[   25.885430] x3 :  x2 : 
[   25.890728] x1 : 0001 x0 : 1980
[   25.896027] Process swapper/0 (pid: 1, stack limit = 0x(ptrval))
[   25.902712] Call trace:
[   25.905146]  __alloc_pages_nodemask+0xf0/0xe70
[   25.909577]  allocate_slab+0x94/0x590
[   25.913225]  new_slab+0x68/0xc8
[   25.916353]  ___slab_alloc+0x444/0x4f8
[   25.920088]  __slab_alloc+0x50/0x68
[   25.923562]  kmem_cache_alloc_node_trace+0xe8/0x230
[   25.928426]  pci_acpi_scan_root+0x94/0x278
[   25.932510]  acpi_pci_root_add+0x228/0x4b0
[   25.936593]  acpi_bus_attach+0x10c/0x218
[   25.940501]  acpi_bus_attach+0xac/0x218
[   25.944323]  acpi_bus_attach+0xac/0x218
[   25.948144]  acpi_bus_scan+0x5c/0xc0
[   25.951708]  acpi_scan_init+0xf8/0x254
[   25.955443]  acpi_init+0x310/0x37c
[   25.958831]  do_one_initcall+0x54/0x208
[   25.962653]  kernel_init_freeable+0x244/0x340
[   25.966999]  kernel_init+0x18/0x118
[   25.970474]  ret_from_fork+0x10/0x1c
[   25.974036] Code: 7100047f 321902a4 1a950095 b5000602 (b9400803)
[   25.980162] ---[ end trace 64f0893eb21ec283 ]---
[   25.984765] Kernel panic - not syncing: Fatal exception

Signed-off-by: Xie XiuQi 
Tested-by: Huiqiang Wang 
Cc: Hanjun Guo 
Cc: Tomasz Nowicki 
Cc: Xishi Qiu 
---
 arch/arm64/kernel/pci.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c
index 0e2ea1c..e17cc45 100644
--- a/arch/arm64/kernel/pci.c
+++ b/arch/arm64/kernel/pci.c
@@ -170,6 +170,9 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root 
*root)
struct pci_bus *bus, *child;
struct acpi_pci_root_ops *root_ops;
 
+   if (node != NUMA_NO_NODE && !node_online(node))
+   node = NUMA_NO_NODE;
+
ri = kzalloc_node(sizeof(*ri), GFP_KERNEL, node);
if (!ri)
return NULL;
-- 
1.8.3.1

Re: [PATCH v5 2/3] GHES: add a notify chain for process memory section

2018-02-08 Thread Xie XiuQi

Hi Boris,

Thanks for your comments.

On 2018/2/7 18:31, Borislav Petkov wrote:
> On Fri, Jan 26, 2018 at 08:31:24PM +0800, Xie XiuQi wrote:
>> Add a notify chain for process memory section, with
>> which other modules might do error recovery.
>>
>> Signed-off-by: Xie XiuQi <xiexi...@huawei.com>
>> Tested-by: Wang Xiongfeng <wangxiongfe...@huawei.com>
>> Tested-by: Tyler Baicar <tbai...@codeaurora.org>
>> ---
>>  drivers/acpi/apei/ghes.c | 10 ++
>>  include/acpi/ghes.h  |  8 
>>  2 files changed, 18 insertions(+)
>>
>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>> index cff671d..1f0ebfb 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -109,6 +109,9 @@ static inline bool is_hest_type_generic_v2(struct ghes 
>> *ghes)
>>  static LIST_HEAD(ghes_hed);
>>  static DEFINE_MUTEX(ghes_list_mutex);
>>  
>> +ATOMIC_NOTIFIER_HEAD(ghes_mem_err_chain);
>> +EXPORT_SYMBOL(ghes_mem_err_chain);
> 
> Definitely not EXPORT_SYMBOL.
> 
> And certainly not export the notifier head. Have register and unregister
> functions instead.
> 
> That is, *if* you can't solve it differently with James. Notifiers
> should be avoided if possible.
> 

Yes, I saw the patchset, I agree. I will work with James to solve the problem.

-- 
Thanks,
Xie XiuQi

Re: [PATCH v5 2/3] GHES: add a notify chain for process memory section

2018-02-08 Thread Xie XiuQi

Hi Boris,

Thanks for your comments.

On 2018/2/7 18:31, Borislav Petkov wrote:
> On Fri, Jan 26, 2018 at 08:31:24PM +0800, Xie XiuQi wrote:
>> Add a notify chain for process memory section, with
>> which other modules might do error recovery.
>>
>> Signed-off-by: Xie XiuQi 
>> Tested-by: Wang Xiongfeng 
>> Tested-by: Tyler Baicar 
>> ---
>>  drivers/acpi/apei/ghes.c | 10 ++
>>  include/acpi/ghes.h  |  8 
>>  2 files changed, 18 insertions(+)
>>
>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>> index cff671d..1f0ebfb 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -109,6 +109,9 @@ static inline bool is_hest_type_generic_v2(struct ghes 
>> *ghes)
>>  static LIST_HEAD(ghes_hed);
>>  static DEFINE_MUTEX(ghes_list_mutex);
>>  
>> +ATOMIC_NOTIFIER_HEAD(ghes_mem_err_chain);
>> +EXPORT_SYMBOL(ghes_mem_err_chain);
> 
> Definitely not EXPORT_SYMBOL.
> 
> And certainly not export the notifier head. Have register and unregister
> functions instead.
> 
> That is, *if* you can't solve it differently with James. Notifiers
> should be avoided if possible.
> 

Yes, I saw the patchset, I agree. I will work with James to solve the problem.

-- 
Thanks,
Xie XiuQi

Re: [PATCH v5 1/3] arm64/ras: support sea error recovery

2018-02-08 Thread Xie XiuQi

Hi James,

Sorry for reply late.

On 2018/2/8 3:03, James Morse wrote:
> Hi Xie XiuQi,
> 
> On 30/01/18 19:19, James Morse wrote:
>> On 26/01/18 12:31, Xie XiuQi wrote:
>>> With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
>>> are consumed. According to the existing process, errors occurred in the
>>> kernel, leading to direct panic, if it occurred the user-space, we should
>>> just kill process.
>>>
>>> But there is a class of error, in fact, is not necessary to kill
>>> process, you can recover and continue to run the process. Such as
>>> the instruction data corrupted, where the memory page might be
>>> read-only, which is has not been modified, the disk might have the
>>> correct data, so you can directly drop the page, ant reload it when
>>> necessary.
>>
>> With firmware-first support, we do all this...
>>
>>
>>> So this patchset is just try to solve such problem: if the error is
>>> consumed in user-space and the error occurs on a clean page, you can
>>> directly drop the memory page without killing process.
>>>
>>> If the corrupted page is clean, just dropped it and return to user-space
>>> without side effects. And if corrupted page is dirty, memory_failure()
>>> will send SIGBUS with code=BUS_MCEERR_AR. While without this patchset,
>>> do_sea() will just send SIGBUS, so the process was killed in the same place.
>>
>> ... but this happens too. I agree its something we should fix, but I don't 
>> think
>> this is the best way to do it.
>>
>> This series is pulling the memory-failure-queue details back into the 
>> arch-code
>> to build a second list, that gets processed as extra work when we return to
>> user-space.
>>
>>
>> The root of the issue is ghes_notify_sea() claims the notification as 
>> something
>> APEI has dealt with, ... but it hasn't done it yet. The signals will be
>> generated by something currently stuck in a queue. (Evidently x86 doesn't 
>> handle
>> synchronous errors like this using firmware-first).
>>
>> I think a smaller fix is to give the queues that may be holding the
>> memory_failure() work a kick as part of the code that calls 
>> ghes_notify_sea().
>> This means that by the time we return to do_sea() ghes_notify_sea()'s claim 
>> that
>> APEI has dealt with it is true as any generated signals are pending. We can 
>> then
>> skip the existing SIGBUS generation code.
>>
>>
>>> Because memory_failure() may sleep, we can not call it directly in SEA
>>
>> (this one is more serious, I've attempted to fix it by moving all NMI-like
>> GHES-notifications to use the estatus queue).
>>
>>
>>> exception context. So we saved faulting physical address associated with
>>> a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return
>>> from SEA exception context and get into do_notify_resume() before the
>>> process running, we could check it and call memory_failure() to do
>>> recovery.
>>
>>> It's safe, because we are in process context.
>>
>> I think this is the trick. When we take a Synchronous-external-abort out of
>> userspace, we're in process context too. We can add helpers to drain the
>> memory_failure_queue which can be called when do_sea() when we know we're
>> preemptible and interrupts-et-al are unmasked.
> 
> Something like... base on [0], in arch/arm64/kernel/acpi.c:

I am very glad that you are trying to solve the problem, which is very helpful.
I agree with your proposal, and I'll test it on by box latter.

Indeed, we're in precess context when we are in sea handler. I was thought we
can't call schedule() in the exception handler before.

Thank you very much!

> -%<-
> int apei_claim_sea(struct pt_regs *regs)
> {
> int cpu;
> int err = -ENOENT;
> unsigned long current_flags = arch_local_save_flags();
> unsigned long interrupted_flags = current_flags;
> 
> if (!IS_ENABLED(CONFIG_ACPI_APEI_SEA))
> return err;
> 
> if (regs)
> interrupted_flags = regs->pstate;
> 
> /*
>  * APEI expects an NMI-like notification to always be called
>  * in NMI context.
>  */
> local_daif_restore(DAIF_ERRCTX);
> nmi_enter();
> err = ghes_notify_sea();
> cpu = smp_processor_id();
> nmi_exit();
> 
> /*
>  * APEI NMI-like notifications are deferred to irq_work. Unless

Re: [PATCH v5 1/3] arm64/ras: support sea error recovery

2018-02-08 Thread Xie XiuQi

Hi James,

Sorry for reply late.

On 2018/2/8 3:03, James Morse wrote:
> Hi Xie XiuQi,
> 
> On 30/01/18 19:19, James Morse wrote:
>> On 26/01/18 12:31, Xie XiuQi wrote:
>>> With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
>>> are consumed. According to the existing process, errors occurred in the
>>> kernel, leading to direct panic, if it occurred the user-space, we should
>>> just kill process.
>>>
>>> But there is a class of error, in fact, is not necessary to kill
>>> process, you can recover and continue to run the process. Such as
>>> the instruction data corrupted, where the memory page might be
>>> read-only, which is has not been modified, the disk might have the
>>> correct data, so you can directly drop the page, ant reload it when
>>> necessary.
>>
>> With firmware-first support, we do all this...
>>
>>
>>> So this patchset is just try to solve such problem: if the error is
>>> consumed in user-space and the error occurs on a clean page, you can
>>> directly drop the memory page without killing process.
>>>
>>> If the corrupted page is clean, just dropped it and return to user-space
>>> without side effects. And if corrupted page is dirty, memory_failure()
>>> will send SIGBUS with code=BUS_MCEERR_AR. While without this patchset,
>>> do_sea() will just send SIGBUS, so the process was killed in the same place.
>>
>> ... but this happens too. I agree its something we should fix, but I don't 
>> think
>> this is the best way to do it.
>>
>> This series is pulling the memory-failure-queue details back into the 
>> arch-code
>> to build a second list, that gets processed as extra work when we return to
>> user-space.
>>
>>
>> The root of the issue is ghes_notify_sea() claims the notification as 
>> something
>> APEI has dealt with, ... but it hasn't done it yet. The signals will be
>> generated by something currently stuck in a queue. (Evidently x86 doesn't 
>> handle
>> synchronous errors like this using firmware-first).
>>
>> I think a smaller fix is to give the queues that may be holding the
>> memory_failure() work a kick as part of the code that calls 
>> ghes_notify_sea().
>> This means that by the time we return to do_sea() ghes_notify_sea()'s claim 
>> that
>> APEI has dealt with it is true as any generated signals are pending. We can 
>> then
>> skip the existing SIGBUS generation code.
>>
>>
>>> Because memory_failure() may sleep, we can not call it directly in SEA
>>
>> (this one is more serious, I've attempted to fix it by moving all NMI-like
>> GHES-notifications to use the estatus queue).
>>
>>
>>> exception context. So we saved faulting physical address associated with
>>> a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return
>>> from SEA exception context and get into do_notify_resume() before the
>>> process running, we could check it and call memory_failure() to do
>>> recovery.
>>
>>> It's safe, because we are in process context.
>>
>> I think this is the trick. When we take a Synchronous-external-abort out of
>> userspace, we're in process context too. We can add helpers to drain the
>> memory_failure_queue which can be called when do_sea() when we know we're
>> preemptible and interrupts-et-al are unmasked.
> 
> Something like... base on [0], in arch/arm64/kernel/acpi.c:

I am very glad that you are trying to solve the problem, which is very helpful.
I agree with your proposal, and I'll test it on by box latter.

Indeed, we're in precess context when we are in sea handler. I was thought we
can't call schedule() in the exception handler before.

Thank you very much!

> -%<-
> int apei_claim_sea(struct pt_regs *regs)
> {
> int cpu;
> int err = -ENOENT;
> unsigned long current_flags = arch_local_save_flags();
> unsigned long interrupted_flags = current_flags;
> 
> if (!IS_ENABLED(CONFIG_ACPI_APEI_SEA))
> return err;
> 
> if (regs)
> interrupted_flags = regs->pstate;
> 
> /*
>  * APEI expects an NMI-like notification to always be called
>  * in NMI context.
>  */
> local_daif_restore(DAIF_ERRCTX);
> nmi_enter();
> err = ghes_notify_sea();
> cpu = smp_processor_id();
> nmi_exit();
> 
> /*
>  * APEI NMI-like notifications are deferred to irq_work. Unless

[PATCH v5 1/3] arm64/ras: support sea error recovery

2018-01-26 Thread Xie XiuQi

With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
are consumed. According to the existing process, errors occurred in the
kernel, leading to direct panic, if it occurred the user-space, we should
just kill process.

But there is a class of error, in fact, is not necessary to kill
process, you can recover and continue to run the process. Such as
the instruction data corrupted, where the memory page might be
read-only, which is has not been modified, the disk might have the
correct data, so you can directly drop the page, ant reload it when
necessary.

So this patchset is just try to solve such problem: if the error is
consumed in user-space and the error occurs on a clean page, you can
directly drop the memory page without killing process.

If the corrupted page is clean, just dropped it and return to user-space
without side effects. And if corrupted page is dirty, memory_failure()
will send SIGBUS with code=BUS_MCEERR_AR. While without this patchset,
do_sea() will just send SIGBUS, so the process was killed in the same place.

Because memory_failure() may sleep, we can not call it directly in SEA
exception context. So we saved faulting physical address associated with
a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return
from SEA exception context and get into do_notify_resume() before the
process running, we could check it and call memory_failure() to do
recovery. It's safe, because we are in process context.

Signed-off-by: Xie XiuQi <xiexi...@huawei.com>
Tested-by: Wang Xiongfeng <wangxiongfe...@huawei.com>
Tested-by: Tyler Baicar <tbai...@codeaurora.org>
Cc: James Morse <james.mo...@arm.com>
Cc: Borislav Petkov <b...@suse.de>
Cc: Julien Thierry <julien.thie...@arm.com>
Cc: Hanjun Guo <hanjun@linaro.org>
---
 arch/arm64/Kconfig   |  11 +++
 arch/arm64/include/asm/ras.h |  23 ++
 arch/arm64/include/asm/thread_info.h |   4 +-
 arch/arm64/kernel/Makefile   |   1 +
 arch/arm64/kernel/ras.c  | 142 +++
 arch/arm64/kernel/signal.c   |   7 ++
 arch/arm64/mm/fault.c|  27 +--
 drivers/acpi/apei/ghes.c |   8 +-
 include/acpi/ghes.h  |   3 +
 9 files changed, 216 insertions(+), 10 deletions(-)
 create mode 100644 arch/arm64/include/asm/ras.h
 create mode 100644 arch/arm64/kernel/ras.c

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index c9a7e9e..8892923 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -688,6 +688,17 @@ config HOTPLUG_CPU
  Say Y here to experiment with turning CPUs off and on.  CPUs
  can be controlled through /sys/devices/system/cpu.
 
+config ARM64_ERR_RECOV
+   bool "Support arm64 RAS error recovery"
+   depends on ACPI_APEI_SEA && MEMORY_FAILURE
+   help
+ With ARM v8.2 RAS Extension, SEA are usually triggered when memory 
errors
+ are consumed. In some cases, if the error address is in a clean page 
or a
+ read-only page, there is a chance to recover. Such as error occurs in 
a
+ instruction page, we can reread this page from disk instead of 
killing process.
+
+ Say Y if unsure.
+
 # Common NUMA Features
 config NUMA
bool "Numa Memory Allocation and Scheduler Support"
diff --git a/arch/arm64/include/asm/ras.h b/arch/arm64/include/asm/ras.h
new file mode 100644
index 000..f0f18da
--- /dev/null
+++ b/arch/arm64/include/asm/ras.h
@@ -0,0 +1,23 @@
+/*
+ * ARM64 SEA error recoery support
+ *
+ * Copyright 2017 Huawei Technologies Co., Ltd.
+ *   Author: Xie XiuQi <xiexi...@huawei.com>
+ *   Author: Wang Xiongfeng <wangxiongfe...@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _ASM_RAS_H
+#define _ASM_RAS_H
+
+extern void sea_notify_process(void);
+
+#endif /*_ASM_RAS_H*/
diff --git a/arch/arm64/include/asm/thread_info.h 
b/arch/arm64/include/asm/thread_info.h
index eb43128..250d769 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -84,6 +84,7 @@ struct thread_info {
 #define TIF_FOREIGN_FPSTATE3   /* CPU's FP state is not current's */
 #define TIF_UPROBE 4   /* uprobe breakpoint or singlestep */
 #define TIF_FSCHECK5   /* Check FS is USER_DS on return */
+#define TIF_SEA_NOTIFY  6   /* notify to do an error recovery */
 #define TIF_NOHZ   7
 #define TIF_SYSCALL_TRACE  8
 #define TIF_SYSCALL_AUDIT

[PATCH v5 1/3] arm64/ras: support sea error recovery

2018-01-26 Thread Xie XiuQi

With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
are consumed. According to the existing process, errors occurred in the
kernel, leading to direct panic, if it occurred the user-space, we should
just kill process.

But there is a class of error, in fact, is not necessary to kill
process, you can recover and continue to run the process. Such as
the instruction data corrupted, where the memory page might be
read-only, which is has not been modified, the disk might have the
correct data, so you can directly drop the page, ant reload it when
necessary.

So this patchset is just try to solve such problem: if the error is
consumed in user-space and the error occurs on a clean page, you can
directly drop the memory page without killing process.

If the corrupted page is clean, just dropped it and return to user-space
without side effects. And if corrupted page is dirty, memory_failure()
will send SIGBUS with code=BUS_MCEERR_AR. While without this patchset,
do_sea() will just send SIGBUS, so the process was killed in the same place.

Because memory_failure() may sleep, we can not call it directly in SEA
exception context. So we saved faulting physical address associated with
a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return
from SEA exception context and get into do_notify_resume() before the
process running, we could check it and call memory_failure() to do
recovery. It's safe, because we are in process context.

Signed-off-by: Xie XiuQi 
Tested-by: Wang Xiongfeng 
Tested-by: Tyler Baicar 
Cc: James Morse 
Cc: Borislav Petkov 
Cc: Julien Thierry 
Cc: Hanjun Guo 
---
 arch/arm64/Kconfig   |  11 +++
 arch/arm64/include/asm/ras.h |  23 ++
 arch/arm64/include/asm/thread_info.h |   4 +-
 arch/arm64/kernel/Makefile   |   1 +
 arch/arm64/kernel/ras.c  | 142 +++
 arch/arm64/kernel/signal.c   |   7 ++
 arch/arm64/mm/fault.c|  27 +--
 drivers/acpi/apei/ghes.c |   8 +-
 include/acpi/ghes.h  |   3 +
 9 files changed, 216 insertions(+), 10 deletions(-)
 create mode 100644 arch/arm64/include/asm/ras.h
 create mode 100644 arch/arm64/kernel/ras.c

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index c9a7e9e..8892923 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -688,6 +688,17 @@ config HOTPLUG_CPU
  Say Y here to experiment with turning CPUs off and on.  CPUs
  can be controlled through /sys/devices/system/cpu.
 
+config ARM64_ERR_RECOV
+   bool "Support arm64 RAS error recovery"
+   depends on ACPI_APEI_SEA && MEMORY_FAILURE
+   help
+ With ARM v8.2 RAS Extension, SEA are usually triggered when memory 
errors
+ are consumed. In some cases, if the error address is in a clean page 
or a
+ read-only page, there is a chance to recover. Such as error occurs in 
a
+ instruction page, we can reread this page from disk instead of 
killing process.
+
+ Say Y if unsure.
+
 # Common NUMA Features
 config NUMA
bool "Numa Memory Allocation and Scheduler Support"
diff --git a/arch/arm64/include/asm/ras.h b/arch/arm64/include/asm/ras.h
new file mode 100644
index 000..f0f18da
--- /dev/null
+++ b/arch/arm64/include/asm/ras.h
@@ -0,0 +1,23 @@
+/*
+ * ARM64 SEA error recoery support
+ *
+ * Copyright 2017 Huawei Technologies Co., Ltd.
+ *   Author: Xie XiuQi 
+ *   Author: Wang Xiongfeng 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _ASM_RAS_H
+#define _ASM_RAS_H
+
+extern void sea_notify_process(void);
+
+#endif /*_ASM_RAS_H*/
diff --git a/arch/arm64/include/asm/thread_info.h 
b/arch/arm64/include/asm/thread_info.h
index eb43128..250d769 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -84,6 +84,7 @@ struct thread_info {
 #define TIF_FOREIGN_FPSTATE3   /* CPU's FP state is not current's */
 #define TIF_UPROBE 4   /* uprobe breakpoint or singlestep */
 #define TIF_FSCHECK5   /* Check FS is USER_DS on return */
+#define TIF_SEA_NOTIFY  6   /* notify to do an error recovery */
 #define TIF_NOHZ   7
 #define TIF_SYSCALL_TRACE  8
 #define TIF_SYSCALL_AUDIT  9
@@ -102,6 +103,7 @@ struct thread_info {
 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
 #define _TIF_FOREIGN_FPSTATE   (1 << TIF_FOREIGN_FPSTATE)
 #define _TIF_NOHZ  (1 << TIF_NOHZ)
+#define _TIF_SEA_NOTIFY (1

[PATCH v5 2/3] GHES: add a notify chain for process memory section

2018-01-26 Thread Xie XiuQi

Add a notify chain for process memory section, with
which other modules might do error recovery.

Signed-off-by: Xie XiuQi <xiexi...@huawei.com>
Tested-by: Wang Xiongfeng <wangxiongfe...@huawei.com>
Tested-by: Tyler Baicar <tbai...@codeaurora.org>
---
 drivers/acpi/apei/ghes.c | 10 ++
 include/acpi/ghes.h  |  8 
 2 files changed, 18 insertions(+)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index cff671d..1f0ebfb 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -109,6 +109,9 @@ static inline bool is_hest_type_generic_v2(struct ghes 
*ghes)
 static LIST_HEAD(ghes_hed);
 static DEFINE_MUTEX(ghes_list_mutex);
 
+ATOMIC_NOTIFIER_HEAD(ghes_mem_err_chain);
+EXPORT_SYMBOL(ghes_mem_err_chain);
+
 /*
  * Because the memory area used to transfer hardware error information
  * from BIOS to Linux can be determined only in NMI, IRQ or timer
@@ -441,6 +444,13 @@ static void ghes_do_proc(struct ghes *ghes,
 
if (guid_equal(sec_type, _SEC_PLATFORM_MEM)) {
struct cper_sec_mem_err *mem_err = 
acpi_hest_get_payload(gdata);
+   struct ghes_mem_err mem;
+
+   mem.notify_type = ghes->generic->notify.type;
+   mem.severity = gdata->error_severity;
+   mem.mem_err = mem_err;
+
+   atomic_notifier_call_chain(_mem_err_chain, 0, 
);
 
ghes_edac_report_mem_error(ghes, sev, mem_err);
 
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index d626367..921ebf1 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -124,4 +124,12 @@ static inline void *acpi_hest_get_next(struct 
acpi_hest_generic_data *gdata)
 extern void ghes_arm_process_error(struct ghes *ghes,
   struct cper_sec_proc_arm *err);
 
+struct ghes_mem_err {
+   int notify_type;
+   int severity;
+   struct cper_sec_mem_err *mem_err;
+};
+
+extern struct atomic_notifier_head ghes_mem_err_chain;
+
 #endif /* GHES_H */
-- 
1.8.3.1

[PATCH v5 2/3] GHES: add a notify chain for process memory section

2018-01-26 Thread Xie XiuQi

Add a notify chain for process memory section, with
which other modules might do error recovery.

Signed-off-by: Xie XiuQi 
Tested-by: Wang Xiongfeng 
Tested-by: Tyler Baicar 
---
 drivers/acpi/apei/ghes.c | 10 ++
 include/acpi/ghes.h  |  8 
 2 files changed, 18 insertions(+)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index cff671d..1f0ebfb 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -109,6 +109,9 @@ static inline bool is_hest_type_generic_v2(struct ghes 
*ghes)
 static LIST_HEAD(ghes_hed);
 static DEFINE_MUTEX(ghes_list_mutex);
 
+ATOMIC_NOTIFIER_HEAD(ghes_mem_err_chain);
+EXPORT_SYMBOL(ghes_mem_err_chain);
+
 /*
  * Because the memory area used to transfer hardware error information
  * from BIOS to Linux can be determined only in NMI, IRQ or timer
@@ -441,6 +444,13 @@ static void ghes_do_proc(struct ghes *ghes,
 
if (guid_equal(sec_type, _SEC_PLATFORM_MEM)) {
struct cper_sec_mem_err *mem_err = 
acpi_hest_get_payload(gdata);
+   struct ghes_mem_err mem;
+
+   mem.notify_type = ghes->generic->notify.type;
+   mem.severity = gdata->error_severity;
+   mem.mem_err = mem_err;
+
+   atomic_notifier_call_chain(_mem_err_chain, 0, 
);
 
ghes_edac_report_mem_error(ghes, sev, mem_err);
 
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index d626367..921ebf1 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -124,4 +124,12 @@ static inline void *acpi_hest_get_next(struct 
acpi_hest_generic_data *gdata)
 extern void ghes_arm_process_error(struct ghes *ghes,
   struct cper_sec_proc_arm *err);
 
+struct ghes_mem_err {
+   int notify_type;
+   int severity;
+   struct cper_sec_mem_err *mem_err;
+};
+
+extern struct atomic_notifier_head ghes_mem_err_chain;
+
 #endif /* GHES_H */
-- 
1.8.3.1

[PATCH v5 0/3] arm64/ras: support sea error recovery

2018-01-26 Thread Xie XiuQi

With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
are consumed. According to the existing process, errors occurred in the
kernel, leading to direct panic, if it occurred the user-space, we should
just kill process.

But there is a class of error, in fact, is not necessary to kill
process, you can recover and continue to run the process. Such as
the instruction data corrupted, where the memory page might be
read-only, which is has not been modified, the disk might have the
correct data, so you can directly drop the page, ant reload it when
necessary.

So this patchset is just try to solve such problem: if the error is
consumed in user-space and the error occurs on a clean page, you can
directly drop the memory page without killing process.

If the corrupted page is clean, just dropped it and return to user-space
without side effects. And if corrupted page is dirty, memory_failure()
will send SIGBUS with code=BUS_MCEERR_AR. While without this patchset,
do_sea() will just send SIGBUS, so the process was killed in the same place.

Because memory_failure() may sleep, we can not call it directly in SEA
exception context. So we saved faulting physical address associated with
a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return
from SEA exception context and get into do_notify_resume() before the
process running, we could check it and call memory_failure() to do
recovery. It's safe, because we are in process context.

In some platform, when SEA triggerred, physical address could be reported
by memory section or by processor section, so we save address at this two
place.

---
v5 - v4:
  - rebased on top of 4.15-rc9 + efi patches
efi patches:
https://patchwork.codeaurora.org/patch/415877/
https://patchwork.codeaurora.org/patch/415879/

  - add Tyler & Xiongfeng's Tested-by.

v4 - v3:
  - rebase on top of the latest mainline
  - make ghes_arm_process_error as a weak function
  - only pick cache error from arm processor section for error recovery
  - fix s-o-b issue

  https://lkml.org/lkml/2017/9/7/98

v3 - v2:
  - fix patch style issue

v2 - v1:
  - wrap arm_proc_error_check and log_arm_hw_error in a single 
arm_process_error()
  - fix sea_save_info return value issue
  - fix link error if this CONFIG_ARM64_ERR_RECOV is not selected
  - use a notify chain instead of call arch_apei_report_mem_error directly

  https://lkml.org/lkml/2017/9/1/189

Xie XiuQi (3):
  arm64/ras: support sea error recovery
  GHES: add a notify chain for process memory section
  arm64/ras: save error address from memory section for recovery

 arch/arm64/Kconfig   |  11 +++
 arch/arm64/include/asm/ras.h |  23 +
 arch/arm64/include/asm/thread_info.h |   4 +-
 arch/arm64/kernel/Makefile   |   1 +
 arch/arm64/kernel/ras.c  | 173 +++
 arch/arm64/kernel/signal.c   |   7 ++
 arch/arm64/mm/fault.c|  27 --
 drivers/acpi/apei/ghes.c |  18 +++-
 include/acpi/ghes.h  |  11 +++
 9 files changed, 265 insertions(+), 10 deletions(-)
 create mode 100644 arch/arm64/include/asm/ras.h
 create mode 100644 arch/arm64/kernel/ras.c

-- 
1.8.3.1

[PATCH v5 3/3] arm64/ras: save error address from memory section for recovery

2018-01-26 Thread Xie XiuQi

In some platform, when SEA triggerred, physical address might be
reported by memory section, so we save it for error recovery later.

Signed-off-by: Xie XiuQi <xiexi...@huawei.com>
Tested-by: Wang Xiongfeng <wangxiongfe...@huawei.com>
Tested-by: Tyler Baicar <tbai...@codeaurora.org>
---
 arch/arm64/kernel/ras.c | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/arch/arm64/kernel/ras.c b/arch/arm64/kernel/ras.c
index 05d3871..4690193 100644
--- a/arch/arm64/kernel/ras.c
+++ b/arch/arm64/kernel/ras.c
@@ -140,3 +140,34 @@ void ghes_arm_process_error(struct ghes *ghes, struct 
cper_sec_proc_arm *err)
if (info_saved)
set_thread_flag(TIF_SEA_NOTIFY);
 }
+
+int ghes_mem_err_callback(struct notifier_block *nb, unsigned long val, void 
*data)
+{
+   bool info_saved = false;
+   struct ghes_mem_err *ghes_mem = (struct ghes_mem_err *)data;
+   struct cper_sec_mem_err *mem_err = ghes_mem->mem_err;
+
+   if ((ghes_mem->notify_type != ACPI_HEST_NOTIFY_SEA) ||
+   (ghes_mem->severity != CPER_SEV_RECOVERABLE))
+   return 0;
+
+   if (mem_err->validation_bits & CPER_MEM_VALID_PA)
+   info_saved = sea_save_info(mem_err->physical_addr);
+
+   if (info_saved)
+   set_thread_flag(TIF_SEA_NOTIFY);
+
+   return 0;
+}
+
+static struct notifier_block ghes_mem_err_nb = {
+   .notifier_call  = ghes_mem_err_callback,
+};
+
+static int arm64_err_recov_init(void)
+{
+   atomic_notifier_chain_register(_mem_err_chain, _mem_err_nb);
+   return 0;
+}
+
+late_initcall(arm64_err_recov_init);
-- 
1.8.3.1

[PATCH v5 0/3] arm64/ras: support sea error recovery

2018-01-26 Thread Xie XiuQi

With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
are consumed. According to the existing process, errors occurred in the
kernel, leading to direct panic, if it occurred the user-space, we should
just kill process.

But there is a class of error, in fact, is not necessary to kill
process, you can recover and continue to run the process. Such as
the instruction data corrupted, where the memory page might be
read-only, which is has not been modified, the disk might have the
correct data, so you can directly drop the page, ant reload it when
necessary.

So this patchset is just try to solve such problem: if the error is
consumed in user-space and the error occurs on a clean page, you can
directly drop the memory page without killing process.

If the corrupted page is clean, just dropped it and return to user-space
without side effects. And if corrupted page is dirty, memory_failure()
will send SIGBUS with code=BUS_MCEERR_AR. While without this patchset,
do_sea() will just send SIGBUS, so the process was killed in the same place.

Because memory_failure() may sleep, we can not call it directly in SEA
exception context. So we saved faulting physical address associated with
a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return
from SEA exception context and get into do_notify_resume() before the
process running, we could check it and call memory_failure() to do
recovery. It's safe, because we are in process context.

In some platform, when SEA triggerred, physical address could be reported
by memory section or by processor section, so we save address at this two
place.

---
v5 - v4:
  - rebased on top of 4.15-rc9 + efi patches
efi patches:
https://patchwork.codeaurora.org/patch/415877/
https://patchwork.codeaurora.org/patch/415879/

  - add Tyler & Xiongfeng's Tested-by.

v4 - v3:
  - rebase on top of the latest mainline
  - make ghes_arm_process_error as a weak function
  - only pick cache error from arm processor section for error recovery
  - fix s-o-b issue

  https://lkml.org/lkml/2017/9/7/98

v3 - v2:
  - fix patch style issue

v2 - v1:
  - wrap arm_proc_error_check and log_arm_hw_error in a single 
arm_process_error()
  - fix sea_save_info return value issue
  - fix link error if this CONFIG_ARM64_ERR_RECOV is not selected
  - use a notify chain instead of call arch_apei_report_mem_error directly

  https://lkml.org/lkml/2017/9/1/189

Xie XiuQi (3):
  arm64/ras: support sea error recovery
  GHES: add a notify chain for process memory section
  arm64/ras: save error address from memory section for recovery

 arch/arm64/Kconfig   |  11 +++
 arch/arm64/include/asm/ras.h |  23 +
 arch/arm64/include/asm/thread_info.h |   4 +-
 arch/arm64/kernel/Makefile   |   1 +
 arch/arm64/kernel/ras.c  | 173 +++
 arch/arm64/kernel/signal.c   |   7 ++
 arch/arm64/mm/fault.c|  27 --
 drivers/acpi/apei/ghes.c |  18 +++-
 include/acpi/ghes.h  |  11 +++
 9 files changed, 265 insertions(+), 10 deletions(-)
 create mode 100644 arch/arm64/include/asm/ras.h
 create mode 100644 arch/arm64/kernel/ras.c

-- 
1.8.3.1

[PATCH v5 3/3] arm64/ras: save error address from memory section for recovery

2018-01-26 Thread Xie XiuQi

In some platform, when SEA triggerred, physical address might be
reported by memory section, so we save it for error recovery later.

Signed-off-by: Xie XiuQi 
Tested-by: Wang Xiongfeng 
Tested-by: Tyler Baicar 
---
 arch/arm64/kernel/ras.c | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/arch/arm64/kernel/ras.c b/arch/arm64/kernel/ras.c
index 05d3871..4690193 100644
--- a/arch/arm64/kernel/ras.c
+++ b/arch/arm64/kernel/ras.c
@@ -140,3 +140,34 @@ void ghes_arm_process_error(struct ghes *ghes, struct 
cper_sec_proc_arm *err)
if (info_saved)
set_thread_flag(TIF_SEA_NOTIFY);
 }
+
+int ghes_mem_err_callback(struct notifier_block *nb, unsigned long val, void 
*data)
+{
+   bool info_saved = false;
+   struct ghes_mem_err *ghes_mem = (struct ghes_mem_err *)data;
+   struct cper_sec_mem_err *mem_err = ghes_mem->mem_err;
+
+   if ((ghes_mem->notify_type != ACPI_HEST_NOTIFY_SEA) ||
+   (ghes_mem->severity != CPER_SEV_RECOVERABLE))
+   return 0;
+
+   if (mem_err->validation_bits & CPER_MEM_VALID_PA)
+   info_saved = sea_save_info(mem_err->physical_addr);
+
+   if (info_saved)
+   set_thread_flag(TIF_SEA_NOTIFY);
+
+   return 0;
+}
+
+static struct notifier_block ghes_mem_err_nb = {
+   .notifier_call  = ghes_mem_err_callback,
+};
+
+static int arm64_err_recov_init(void)
+{
+   atomic_notifier_chain_register(_mem_err_chain, _mem_err_nb);
+   return 0;
+}
+
+late_initcall(arm64_err_recov_init);
-- 
1.8.3.1

Re: [PATCH v4 0/4] arm64/ras: support sea error recovery

2018-01-25 Thread Xie XiuQi

Hi Tyler,

On 2018/1/26 1:11, Tyler Baicar wrote:
> For this series - Tested-by: Tyler Baicar <tbai...@codeaurora.org>
> 
> Note that this will probably need to be rebased on top of these patches:

Thank you for your testing, I'll rebase then.

> 
> https://patchwork.codeaurora.org/patch/415877/
> https://patchwork.codeaurora.org/patch/415879/
> 
> With that, the first patch should be able to be removed because the above 
> patches already define the ARM error types:
> 
> +#define CPER_ARM_CACHE_ERROR0
> +#define CPER_ARM_TLB_ERROR1
> +#define CPER_ARM_BUS_ERROR2
> +#define CPER_ARM_VENDOR_ERROR3

-- 
Thanks,
Xie XiuQi

Re: [PATCH v4 0/4] arm64/ras: support sea error recovery

2018-01-25 Thread Xie XiuQi

Hi Tyler,

On 2018/1/26 1:11, Tyler Baicar wrote:
> For this series - Tested-by: Tyler Baicar 
> 
> Note that this will probably need to be rebased on top of these patches:

Thank you for your testing, I'll rebase then.

> 
> https://patchwork.codeaurora.org/patch/415877/
> https://patchwork.codeaurora.org/patch/415879/
> 
> With that, the first patch should be able to be removed because the above 
> patches already define the ARM error types:
> 
> +#define CPER_ARM_CACHE_ERROR0
> +#define CPER_ARM_TLB_ERROR1
> +#define CPER_ARM_BUS_ERROR2
> +#define CPER_ARM_VENDOR_ERROR3

-- 
Thanks,
Xie XiuQi

Re: [RESEND PATCH V2] arm64: fault: avoid send SIGBUS two times

2017-12-11 Thread Xie XiuQi

Hi Dongjiu,

On 2017/12/12 0:05, Dongjiu Geng wrote:
> do_sea() calls arm64_notify_die() which will always signal
> user-space. It also returns whether APEI claimed the external
> abort as a RAS notification. If it returns failure do_mem_abort()
> will signal user-space too.
> 
> do_mem_abort() wants to know if we handled the error, we always
> call arm64_notify_die() so can always return success.
> 
> Signed-off-by: Dongjiu Geng <gengdong...@huawei.com>
> ---
> 1. Address James's comments to update the commit messages
> 2. Address James's comments to not change the si_code for SIGBUS
> ---
>  arch/arm64/mm/fault.c | 7 +++
>  1 file changed, 3 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
> index b64958b..38b9f3e 100644
> --- a/arch/arm64/mm/fault.c
> +++ b/arch/arm64/mm/fault.c
> @@ -610,7 +610,6 @@ static int do_sea(unsigned long addr, unsigned int esr, 
> struct pt_regs *regs)
>  {
>   struct siginfo info;
>   const struct fault_info *inf;
> - int ret = 0;
>  
>   inf = esr_to_fault_info(esr);
>   pr_err("Synchronous External Abort: %s (0x%08x) at 0x%016lx\n",
> @@ -625,7 +624,7 @@ static int do_sea(unsigned long addr, unsigned int esr, 
> struct pt_regs *regs)
>   if (interrupts_enabled(regs))
>   nmi_enter();
>  
> - ret = ghes_notify_sea();
> + ghes_notify_sea();
>  
>   if (interrupts_enabled(regs))
>   nmi_exit();
> @@ -640,7 +639,7 @@ static int do_sea(unsigned long addr, unsigned int esr, 
> struct pt_regs *regs)
>   info.si_addr  = (void __user *)addr;
>   arm64_notify_die("", regs, , esr);
>  
> - return ret;
> + return 0;

It looks good to me. do_sea() has done all necessary action for SEA, so it 
should always return 0,
no matter ghes_notify_sea() return true or false.

Reviewed-by: Xie XiuQi <xiexi...@huawei.com>

>  }
>  
>  static const struct fault_info fault_info[] = {
> --
> 2.10.1
> 

-- 
Thanks,
Xie XiuQi

Re: [RESEND PATCH V2] arm64: fault: avoid send SIGBUS two times

2017-12-11 Thread Xie XiuQi

Hi Dongjiu,

On 2017/12/12 0:05, Dongjiu Geng wrote:
> do_sea() calls arm64_notify_die() which will always signal
> user-space. It also returns whether APEI claimed the external
> abort as a RAS notification. If it returns failure do_mem_abort()
> will signal user-space too.
> 
> do_mem_abort() wants to know if we handled the error, we always
> call arm64_notify_die() so can always return success.
> 
> Signed-off-by: Dongjiu Geng 
> ---
> 1. Address James's comments to update the commit messages
> 2. Address James's comments to not change the si_code for SIGBUS
> ---
>  arch/arm64/mm/fault.c | 7 +++
>  1 file changed, 3 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
> index b64958b..38b9f3e 100644
> --- a/arch/arm64/mm/fault.c
> +++ b/arch/arm64/mm/fault.c
> @@ -610,7 +610,6 @@ static int do_sea(unsigned long addr, unsigned int esr, 
> struct pt_regs *regs)
>  {
>   struct siginfo info;
>   const struct fault_info *inf;
> - int ret = 0;
>  
>   inf = esr_to_fault_info(esr);
>   pr_err("Synchronous External Abort: %s (0x%08x) at 0x%016lx\n",
> @@ -625,7 +624,7 @@ static int do_sea(unsigned long addr, unsigned int esr, 
> struct pt_regs *regs)
>   if (interrupts_enabled(regs))
>   nmi_enter();
>  
> - ret = ghes_notify_sea();
> + ghes_notify_sea();
>  
>   if (interrupts_enabled(regs))
>   nmi_exit();
> @@ -640,7 +639,7 @@ static int do_sea(unsigned long addr, unsigned int esr, 
> struct pt_regs *regs)
>   info.si_addr  = (void __user *)addr;
>   arm64_notify_die("", regs, , esr);
>  
> - return ret;
> + return 0;

It looks good to me. do_sea() has done all necessary action for SEA, so it 
should always return 0,
no matter ghes_notify_sea() return true or false.

Reviewed-by: Xie XiuQi 

>  }
>  
>  static const struct fault_info fault_info[] = {
> --
> 2.10.1
> 

-- 
Thanks,
Xie XiuQi

[tip:ras/core] x86/MCE: Extend table to report action optional errors through CMCI too

2017-12-04 Thread tip-bot for Xie XiuQi

Commit-ID:  e085ac7a6ddbd746966083c5e13aa290c3e9a253
Gitweb: https://git.kernel.org/tip/e085ac7a6ddbd746966083c5e13aa290c3e9a253
Author: Xie XiuQi <xiexi...@huawei.com>
AuthorDate: Mon, 4 Dec 2017 17:54:37 +0100
Committer:  Thomas Gleixner <t...@linutronix.de>
CommitDate: Mon, 4 Dec 2017 20:38:44 +0100

x86/MCE: Extend table to report action optional errors through CMCI too

According to the Intel SDM Volume 3B (253669-063US, July 2017), action
optional (SRAO) errors can be reported either via MCE or CMC:

  In cases when SRAO is signaled via CMCI the error signature is
  indicated via UC=1, PCC=0, S=0.

  Type(*1)  UC  EN  PCC S   AR  Signaling
  ---
  UC1   1   1   x   x   MCE
  SRAR  1   1   0   1   1   MCE
  SRAO  1   x(*2)   0   x(*2)   0   MCE/CMC
  UCNA  1   x   0   0   0   CMC
  CE0   x   x   x   x   CMC

  NOTES:
  1. SRAR, SRAO and UCNA errors are supported by the processor only
 when IA32_MCG_CAP[24] (MCG_SER_P) is set.
  2. EN=1, S=1 when signaled via MCE. EN=x, S=0 when signaled via CMC.

And there is a description in 15.6.2 UCR Error Reporting and Logging, for
bit S:

  S (Signaling) flag, bit 56 - Indicates (when set) that a machine check
  exception was generated for the UCR error reported in this MC bank...
  When the S flag in the IA32_MCi_STATUS register is clear, this UCR error
  was not signaled via a machine check exception and instead was reported
  as a corrected machine check (CMC).

So merge the two cases and just remove the S=0 check for SRAO in
mce_severity().

[ Borislav: Massage commit message.]

Signed-off-by: Xie XiuQi <xiexi...@huawei.com>
Signed-off-by: Borislav Petkov <b...@suse.de>
Signed-off-by: Thomas Gleixner <t...@linutronix.de>
Reviewed-by: Tony Luck <tony.l...@intel.com>
Tested-by: Chen Wei <chenwe...@huawei.com>
Cc: linux-edac <linux-e...@vger.kernel.org>
Link: 
http://lkml.kernel.org/r/1511575548-41992-1-git-send-email-xiexi...@huawei.com

---
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 26 +-
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c 
b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 4ca632a..5bbd06f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -59,6 +59,7 @@ static struct severity {
 #define  MCGMASK(x, y) .mcgmask = x, .mcgres = y
 #define  MASK(x, y).mask = x, .result = y
 #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
+#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR)
 #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
 #defineMCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
 
@@ -101,6 +102,22 @@ static struct severity {
NOSER, BITCLR(MCI_STATUS_UC)
),
 
+   /*
+* known AO MCACODs reported via MCE or CMC:
+*
+* SRAO could be signaled either via a machine check exception or
+* CMCI with the corresponding bit S 1 or 0. So we don't need to
+* check bit S for SRAO.
+*/
+   MCESEV(
+   AO, "Action optional: memory scrubbing error",
+   SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, 
MCI_STATUS_UC|MCACOD_SCRUB)
+   ),
+   MCESEV(
+   AO, "Action optional: last level cache writeback error",
+   SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, 
MCI_STATUS_UC|MCACOD_L3WB)
+   ),
+
/* ignore OVER for UCNA */
MCESEV(
UCNA, "Uncorrected no action required",
@@ -149,15 +166,6 @@ static struct severity {
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
),
 
-   /* known AO MCACODs: */
-   MCESEV(
-   AO, "Action optional: memory scrubbing error",
-   SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, 
MCI_UC_S|MCACOD_SCRUB)
-   ),
-   MCESEV(
-   AO, "Action optional: last level cache writeback error",
-   SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, 
MCI_UC_S|MCACOD_L3WB)
-   ),
MCESEV(
SOME, "Action optional: unknown MCACOD",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)

[tip:ras/core] x86/MCE: Extend table to report action optional errors through CMCI too

2017-12-04 Thread tip-bot for Xie XiuQi

Commit-ID:  e085ac7a6ddbd746966083c5e13aa290c3e9a253
Gitweb: https://git.kernel.org/tip/e085ac7a6ddbd746966083c5e13aa290c3e9a253
Author: Xie XiuQi 
AuthorDate: Mon, 4 Dec 2017 17:54:37 +0100
Committer:  Thomas Gleixner 
CommitDate: Mon, 4 Dec 2017 20:38:44 +0100

x86/MCE: Extend table to report action optional errors through CMCI too

According to the Intel SDM Volume 3B (253669-063US, July 2017), action
optional (SRAO) errors can be reported either via MCE or CMC:

  In cases when SRAO is signaled via CMCI the error signature is
  indicated via UC=1, PCC=0, S=0.

  Type(*1)  UC  EN  PCC S   AR  Signaling
  ---
  UC1   1   1   x   x   MCE
  SRAR  1   1   0   1   1   MCE
  SRAO  1   x(*2)   0   x(*2)   0   MCE/CMC
  UCNA  1   x   0   0   0   CMC
  CE0   x   x   x   x   CMC

  NOTES:
  1. SRAR, SRAO and UCNA errors are supported by the processor only
 when IA32_MCG_CAP[24] (MCG_SER_P) is set.
  2. EN=1, S=1 when signaled via MCE. EN=x, S=0 when signaled via CMC.

And there is a description in 15.6.2 UCR Error Reporting and Logging, for
bit S:

  S (Signaling) flag, bit 56 - Indicates (when set) that a machine check
  exception was generated for the UCR error reported in this MC bank...
  When the S flag in the IA32_MCi_STATUS register is clear, this UCR error
  was not signaled via a machine check exception and instead was reported
  as a corrected machine check (CMC).

So merge the two cases and just remove the S=0 check for SRAO in
mce_severity().

[ Borislav: Massage commit message.]

Signed-off-by: Xie XiuQi 
Signed-off-by: Borislav Petkov 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Tony Luck 
Tested-by: Chen Wei 
Cc: linux-edac 
Link: 
http://lkml.kernel.org/r/1511575548-41992-1-git-send-email-xiexi...@huawei.com

---
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 26 +-
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c 
b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 4ca632a..5bbd06f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -59,6 +59,7 @@ static struct severity {
 #define  MCGMASK(x, y) .mcgmask = x, .mcgres = y
 #define  MASK(x, y).mask = x, .result = y
 #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
+#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR)
 #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
 #defineMCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
 
@@ -101,6 +102,22 @@ static struct severity {
NOSER, BITCLR(MCI_STATUS_UC)
),
 
+   /*
+* known AO MCACODs reported via MCE or CMC:
+*
+* SRAO could be signaled either via a machine check exception or
+* CMCI with the corresponding bit S 1 or 0. So we don't need to
+* check bit S for SRAO.
+*/
+   MCESEV(
+   AO, "Action optional: memory scrubbing error",
+   SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, 
MCI_STATUS_UC|MCACOD_SCRUB)
+   ),
+   MCESEV(
+   AO, "Action optional: last level cache writeback error",
+   SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, 
MCI_STATUS_UC|MCACOD_L3WB)
+   ),
+
/* ignore OVER for UCNA */
MCESEV(
UCNA, "Uncorrected no action required",
@@ -149,15 +166,6 @@ static struct severity {
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
),
 
-   /* known AO MCACODs: */
-   MCESEV(
-   AO, "Action optional: memory scrubbing error",
-   SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, 
MCI_UC_S|MCACOD_SCRUB)
-   ),
-   MCESEV(
-   AO, "Action optional: last level cache writeback error",
-   SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, 
MCI_UC_S|MCACOD_L3WB)
-   ),
MCESEV(
SOME, "Action optional: unknown MCACOD",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)

[PATCH resend] trace/xdp: fix compile warning: 'struct bpf_map' declared inside parameter list

2017-11-29 Thread Xie XiuQi

We meet this compile warning, which caused by missing bpf.h in xdp.h.

In file included from ./include/trace/events/xdp.h:10:0,
 from ./include/linux/bpf_trace.h:6,
 from drivers/net/ethernet/intel/i40e/i40e_txrx.c:29:
./include/trace/events/xdp.h:93:17: warning: ‘struct bpf_map’ declared inside 
parameter list will not be visible outside of this definition or declaration
const struct bpf_map *map, u32 map_index),
 ^
./include/linux/tracepoint.h:187:34: note: in definition of macro 
‘__DECLARE_TRACE’
  static inline void trace_##name(proto)\
  ^
./include/linux/tracepoint.h:352:24: note: in expansion of macro ‘PARAMS’
  __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args),  \
^~
./include/linux/tracepoint.h:477:2: note: in expansion of macro ‘DECLARE_TRACE’
  DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
  ^
./include/linux/tracepoint.h:477:22: note: in expansion of macro ‘PARAMS’
  DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
  ^~
./include/trace/events/xdp.h:89:1: note: in expansion of macro ‘DEFINE_EVENT’
 DEFINE_EVENT(xdp_redirect_template, xdp_redirect,
 ^~~~
./include/trace/events/xdp.h:90:2: note: in expansion of macro ‘TP_PROTO’
  TP_PROTO(const struct net_device *dev,
  ^~~~
./include/trace/events/xdp.h:93:17: warning: ‘struct bpf_map’ declared inside 
parameter list will not be visible outside of this definition or declaration
const struct bpf_map *map, u32 map_index),
 ^
./include/linux/tracepoint.h:203:38: note: in definition of macro 
‘__DECLARE_TRACE’
  register_trace_##name(void (*probe)(data_proto), void *data) \
  ^~
./include/linux/tracepoint.h:354:4: note: in expansion of macro ‘PARAMS’
PARAMS(void *__data, proto),   \
^~

Reported-by: Huang Daode <huangda...@hisilicon.com>
Cc: Hanjun Guo <guohan...@huawei.com>
Fixes: 8d3b778ff544 ("xdp: tracepoint xdp_redirect also need a map argument")
Signed-off-by: Xie XiuQi <xiexi...@huawei.com>
Acked-by: Jesper Dangaard Brouer <bro...@redhat.com>
Acked-by: Steven Rostedt (VMware) <rost...@goodmis.org>
---
 include/trace/events/xdp.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 4cd0f05..8989a92 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define __XDP_ACT_MAP(FN)  \
FN(ABORTED) \
-- 
1.8.3.1

[PATCH resend] trace/xdp: fix compile warning: 'struct bpf_map' declared inside parameter list

2017-11-29 Thread Xie XiuQi

We meet this compile warning, which caused by missing bpf.h in xdp.h.

In file included from ./include/trace/events/xdp.h:10:0,
 from ./include/linux/bpf_trace.h:6,
 from drivers/net/ethernet/intel/i40e/i40e_txrx.c:29:
./include/trace/events/xdp.h:93:17: warning: ‘struct bpf_map’ declared inside 
parameter list will not be visible outside of this definition or declaration
const struct bpf_map *map, u32 map_index),
 ^
./include/linux/tracepoint.h:187:34: note: in definition of macro 
‘__DECLARE_TRACE’
  static inline void trace_##name(proto)\
  ^
./include/linux/tracepoint.h:352:24: note: in expansion of macro ‘PARAMS’
  __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args),  \
^~
./include/linux/tracepoint.h:477:2: note: in expansion of macro ‘DECLARE_TRACE’
  DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
  ^
./include/linux/tracepoint.h:477:22: note: in expansion of macro ‘PARAMS’
  DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
  ^~
./include/trace/events/xdp.h:89:1: note: in expansion of macro ‘DEFINE_EVENT’
 DEFINE_EVENT(xdp_redirect_template, xdp_redirect,
 ^~~~
./include/trace/events/xdp.h:90:2: note: in expansion of macro ‘TP_PROTO’
  TP_PROTO(const struct net_device *dev,
  ^~~~
./include/trace/events/xdp.h:93:17: warning: ‘struct bpf_map’ declared inside 
parameter list will not be visible outside of this definition or declaration
const struct bpf_map *map, u32 map_index),
 ^
./include/linux/tracepoint.h:203:38: note: in definition of macro 
‘__DECLARE_TRACE’
  register_trace_##name(void (*probe)(data_proto), void *data) \
  ^~
./include/linux/tracepoint.h:354:4: note: in expansion of macro ‘PARAMS’
PARAMS(void *__data, proto),   \
^~

Reported-by: Huang Daode 
Cc: Hanjun Guo 
Fixes: 8d3b778ff544 ("xdp: tracepoint xdp_redirect also need a map argument")
Signed-off-by: Xie XiuQi 
Acked-by: Jesper Dangaard Brouer 
Acked-by: Steven Rostedt (VMware) 
---
 include/trace/events/xdp.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 4cd0f05..8989a92 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define __XDP_ACT_MAP(FN)  \
FN(ABORTED) \
-- 
1.8.3.1

Re: [PATCH] trace/xdp: fix compile warning: ‘struct bpf_map’ declared inside parameter list

2017-11-29 Thread Xie XiuQi


On 2017/11/29 19:13, Daniel Borkmann wrote:
> Xie, thanks for the patch! We could route this fix via bpf tree if you want.
> 
> Could you resend your patch with below Fixes and Acked-by tag added to
> net...@vger.kernel.org in Cc, so that it ends up in patchwork there?
> 

Sure, I'll resend soon.

>> Fixes: 8d3b778ff544 ("xdp: tracepoint xdp_redirect also need a map argument")
>>
>> Acked-by: Jesper Dangaard Brouer <bro...@redhat.com>
> Thanks,
> Daniel

-- 
Thanks,
Xie XiuQi

Re: [PATCH] trace/xdp: fix compile warning: ‘struct bpf_map’ declared inside parameter list

2017-11-29 Thread Xie XiuQi


On 2017/11/29 19:13, Daniel Borkmann wrote:
> Xie, thanks for the patch! We could route this fix via bpf tree if you want.
> 
> Could you resend your patch with below Fixes and Acked-by tag added to
> net...@vger.kernel.org in Cc, so that it ends up in patchwork there?
> 

Sure, I'll resend soon.

>> Fixes: 8d3b778ff544 ("xdp: tracepoint xdp_redirect also need a map argument")
>>
>> Acked-by: Jesper Dangaard Brouer 
> Thanks,
> Daniel

-- 
Thanks,
Xie XiuQi

[PATCH] trace/xdp: fix compile warning: ‘struct bpf_map’ declared inside parameter list

2017-11-29 Thread Xie XiuQi

We meet this compile warning, which caused by missing bpf.h in xdp.h.

In file included from ./include/trace/events/xdp.h:10:0,
 from ./include/linux/bpf_trace.h:6,
 from drivers/net/ethernet/intel/i40e/i40e_txrx.c:29:
./include/trace/events/xdp.h:93:17: warning: ‘struct bpf_map’ declared inside 
parameter list will not be visible outside of this definition or declaration
const struct bpf_map *map, u32 map_index),
 ^
./include/linux/tracepoint.h:187:34: note: in definition of macro 
‘__DECLARE_TRACE’
  static inline void trace_##name(proto)\
  ^
./include/linux/tracepoint.h:352:24: note: in expansion of macro ‘PARAMS’
  __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args),  \
^~
./include/linux/tracepoint.h:477:2: note: in expansion of macro ‘DECLARE_TRACE’
  DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
  ^
./include/linux/tracepoint.h:477:22: note: in expansion of macro ‘PARAMS’
  DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
  ^~
./include/trace/events/xdp.h:89:1: note: in expansion of macro ‘DEFINE_EVENT’
 DEFINE_EVENT(xdp_redirect_template, xdp_redirect,
 ^~~~
./include/trace/events/xdp.h:90:2: note: in expansion of macro ‘TP_PROTO’
  TP_PROTO(const struct net_device *dev,
  ^~~~
./include/trace/events/xdp.h:93:17: warning: ‘struct bpf_map’ declared inside 
parameter list will not be visible outside of this definition or declaration
const struct bpf_map *map, u32 map_index),
 ^
./include/linux/tracepoint.h:203:38: note: in definition of macro 
‘__DECLARE_TRACE’
  register_trace_##name(void (*probe)(data_proto), void *data) \
  ^~
./include/linux/tracepoint.h:354:4: note: in expansion of macro ‘PARAMS’
PARAMS(void *__data, proto),   \
^~

Reported-by: Huang Daode <huangda...@hisilicon.com>
Cc: Hanjun Guo <guohan...@huawei.com>
Signed-off-by: Xie XiuQi <xiexi...@huawei.com>
---
 include/trace/events/xdp.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 4cd0f05..8989a92 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define __XDP_ACT_MAP(FN)  \
FN(ABORTED) \
-- 
1.8.3.1

[PATCH] trace/xdp: fix compile warning: ‘struct bpf_map’ declared inside parameter list

2017-11-29 Thread Xie XiuQi

We meet this compile warning, which caused by missing bpf.h in xdp.h.

In file included from ./include/trace/events/xdp.h:10:0,
 from ./include/linux/bpf_trace.h:6,
 from drivers/net/ethernet/intel/i40e/i40e_txrx.c:29:
./include/trace/events/xdp.h:93:17: warning: ‘struct bpf_map’ declared inside 
parameter list will not be visible outside of this definition or declaration
const struct bpf_map *map, u32 map_index),
 ^
./include/linux/tracepoint.h:187:34: note: in definition of macro 
‘__DECLARE_TRACE’
  static inline void trace_##name(proto)\
  ^
./include/linux/tracepoint.h:352:24: note: in expansion of macro ‘PARAMS’
  __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args),  \
^~
./include/linux/tracepoint.h:477:2: note: in expansion of macro ‘DECLARE_TRACE’
  DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
  ^
./include/linux/tracepoint.h:477:22: note: in expansion of macro ‘PARAMS’
  DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
  ^~
./include/trace/events/xdp.h:89:1: note: in expansion of macro ‘DEFINE_EVENT’
 DEFINE_EVENT(xdp_redirect_template, xdp_redirect,
 ^~~~
./include/trace/events/xdp.h:90:2: note: in expansion of macro ‘TP_PROTO’
  TP_PROTO(const struct net_device *dev,
  ^~~~
./include/trace/events/xdp.h:93:17: warning: ‘struct bpf_map’ declared inside 
parameter list will not be visible outside of this definition or declaration
const struct bpf_map *map, u32 map_index),
 ^
./include/linux/tracepoint.h:203:38: note: in definition of macro 
‘__DECLARE_TRACE’
  register_trace_##name(void (*probe)(data_proto), void *data) \
  ^~
./include/linux/tracepoint.h:354:4: note: in expansion of macro ‘PARAMS’
PARAMS(void *__data, proto),   \
^~

Reported-by: Huang Daode 
Cc: Hanjun Guo 
Signed-off-by: Xie XiuQi 
---
 include/trace/events/xdp.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 4cd0f05..8989a92 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define __XDP_ACT_MAP(FN)  \
FN(ABORTED) \
-- 
1.8.3.1

[PATCH v2] x86/mce: add support SRAO reported via CMC check

2017-11-24 Thread Xie XiuQi

In Intel SDM Volume 3B (253669-063US, July 2017), SRAO could be
reported either via MCE or CMC:

  In cases when SRAO is signaled via CMCI the error signature is
  indicated via UC=1, PCC=0, S=0.

  Type(*1)  UC  EN  PCC S   AR  Signaling
  ---
  UC1   1   1   x   x   MCE
  SRAR  1   1   0   1   1   MCE
  SRAO  1   x(*2)   0   x(*2)   0   MCE/CMC
  UCNA  1   x   0   0   0   CMC
  CE0   x   x   x   x   CMC

  NOTES:
  1. SRAR, SRAO and UCNA errors are supported by the processor only
 when IA32_MCG_CAP[24] (MCG_SER_P) is set.
  2. EN=1, S=1 when signaled via MCE. EN=x, S=0 when signaled via CMC.

And there is a description in 15.6.2 UCR Error Reporting and Logging, for bit S:

  S (Signaling) flag, bit 56 - Indicates (when set) that a machine check
  exception was generated for the UCR error reported in this MC bank...
  When the S flag in the IA32_MCi_STATUS register is clear, this UCR error
  was not signaled via a machine check exception and instead was reported
  as a corrected machine check (CMC).

So we could merge this two case, and just remove the S=0 check for SRAO
in mce_severity().

---
v2: add OVER=0 check and merge MCE and CMC case.

Signed-off-by: Xie XiuQi <xiexi...@huawei.com>
Tested-by: Chen Wei <chenwe...@huawei.com>
Reviewed-by: Tony Luck <tony.l...@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 26 +-
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c 
b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 4ca632a..5bbd06f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -59,6 +59,7 @@
 #define  MCGMASK(x, y) .mcgmask = x, .mcgres = y
 #define  MASK(x, y).mask = x, .result = y
 #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
+#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR)
 #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
 #defineMCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
 
@@ -101,6 +102,22 @@
NOSER, BITCLR(MCI_STATUS_UC)
),
 
+   /*
+* known AO MCACODs reported via MCE or CMC:
+*
+* SRAO could be signaled either via a machine check exception or
+* CMCI with the corresponding bit S 1 or 0. So we don't need to
+* check bit S for SRAO.
+*/
+   MCESEV(
+   AO, "Action optional: memory scrubbing error",
+   SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, 
MCI_STATUS_UC|MCACOD_SCRUB)
+   ),
+   MCESEV(
+   AO, "Action optional: last level cache writeback error",
+   SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, 
MCI_STATUS_UC|MCACOD_L3WB)
+   ),
+
/* ignore OVER for UCNA */
MCESEV(
UCNA, "Uncorrected no action required",
@@ -149,15 +166,6 @@
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
),
 
-   /* known AO MCACODs: */
-   MCESEV(
-   AO, "Action optional: memory scrubbing error",
-   SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, 
MCI_UC_S|MCACOD_SCRUB)
-   ),
-   MCESEV(
-   AO, "Action optional: last level cache writeback error",
-   SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, 
MCI_UC_S|MCACOD_L3WB)
-   ),
MCESEV(
SOME, "Action optional: unknown MCACOD",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
-- 
1.8.3.1

[PATCH v2] x86/mce: add support SRAO reported via CMC check

2017-11-24 Thread Xie XiuQi

In Intel SDM Volume 3B (253669-063US, July 2017), SRAO could be
reported either via MCE or CMC:

  In cases when SRAO is signaled via CMCI the error signature is
  indicated via UC=1, PCC=0, S=0.

  Type(*1)  UC  EN  PCC S   AR  Signaling
  ---
  UC1   1   1   x   x   MCE
  SRAR  1   1   0   1   1   MCE
  SRAO  1   x(*2)   0   x(*2)   0   MCE/CMC
  UCNA  1   x   0   0   0   CMC
  CE0   x   x   x   x   CMC

  NOTES:
  1. SRAR, SRAO and UCNA errors are supported by the processor only
 when IA32_MCG_CAP[24] (MCG_SER_P) is set.
  2. EN=1, S=1 when signaled via MCE. EN=x, S=0 when signaled via CMC.

And there is a description in 15.6.2 UCR Error Reporting and Logging, for bit S:

  S (Signaling) flag, bit 56 - Indicates (when set) that a machine check
  exception was generated for the UCR error reported in this MC bank...
  When the S flag in the IA32_MCi_STATUS register is clear, this UCR error
  was not signaled via a machine check exception and instead was reported
  as a corrected machine check (CMC).

So we could merge this two case, and just remove the S=0 check for SRAO
in mce_severity().

---
v2: add OVER=0 check and merge MCE and CMC case.

Signed-off-by: Xie XiuQi 
Tested-by: Chen Wei 
Reviewed-by: Tony Luck 
---
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 26 +-
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c 
b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 4ca632a..5bbd06f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -59,6 +59,7 @@
 #define  MCGMASK(x, y) .mcgmask = x, .mcgres = y
 #define  MASK(x, y).mask = x, .result = y
 #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
+#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR)
 #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
 #defineMCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
 
@@ -101,6 +102,22 @@
NOSER, BITCLR(MCI_STATUS_UC)
),
 
+   /*
+* known AO MCACODs reported via MCE or CMC:
+*
+* SRAO could be signaled either via a machine check exception or
+* CMCI with the corresponding bit S 1 or 0. So we don't need to
+* check bit S for SRAO.
+*/
+   MCESEV(
+   AO, "Action optional: memory scrubbing error",
+   SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, 
MCI_STATUS_UC|MCACOD_SCRUB)
+   ),
+   MCESEV(
+   AO, "Action optional: last level cache writeback error",
+   SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, 
MCI_STATUS_UC|MCACOD_L3WB)
+   ),
+
/* ignore OVER for UCNA */
MCESEV(
UCNA, "Uncorrected no action required",
@@ -149,15 +166,6 @@
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
),
 
-   /* known AO MCACODs: */
-   MCESEV(
-   AO, "Action optional: memory scrubbing error",
-   SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, 
MCI_UC_S|MCACOD_SCRUB)
-   ),
-   MCESEV(
-   AO, "Action optional: last level cache writeback error",
-   SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, 
MCI_UC_S|MCACOD_L3WB)
-   ),
MCESEV(
SOME, "Action optional: unknown MCACOD",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
-- 
1.8.3.1

Re: [PATCH] x86/mce: add support SRAO reported via CMC check

2017-11-15 Thread Xie XiuQi

Hi Borislav, Tony,

On 2017/11/15 18:33, Borislav Petkov wrote:
> On Wed, Nov 15, 2017 at 02:44:07AM +, Luck, Tony wrote:
>> This code is subtle :-(
> 
> I'm glad that we agree on this! :-)
> 
> Anyone wanting to rewrite it yet?
> 

In Intel SDM Volume 3B (253669-063US, July 2017), SRAO could be
reported either via MCE or CMC:

  In cases when SRAO is signaled via CMCI the error signature is
  indicated via UC=1, PCC=0, S=0.

  Type(*1)  UC  EN  PCC S   AR  Signaling
  ---
  UC1   1   1   x   x   MCE
  SRAR  1   1   0   1   1   MCE
  SRAO  1   x(*2)   0   x(*2)   0   MCE/CMC
  UCNA  1   x   0   0   0   CMC
  CE0   x   x   x   x   CMC

  NOTES:
  1. SRAR, SRAO and UCNA errors are supported by the processor only
 when IA32_MCG_CAP[24] (MCG_SER_P) is set.
  2. EN=1, S=1 when signaled via MCE. EN=x, S=0 when signaled via CMC.

And there is a description in 15.6.2 UCR Error Reporting and Logging, for bit S:

  S (Signaling) flag, bit 56 - Indicates (when set) that a machine check
  exception was generated for the UCR error reported in this MC bank...
  When the S flag in the IA32_MCi_STATUS register is clear, this UCR error
  was not signaled via a machine check exception and instead was reported
  as a corrected machine check (CMC).

As the description in SDM, I think this flag could be used to determine whether
MCE or CMC was triggered. So we could merge this two case in one and just
remove the S=0 check for SRAO.

How about this patch?

>From a06b2a781a86e3b1fe241591b53f7a6d33d63331 Mon Sep 17 00:00:00 2001
From: Xie XiuQi <xiexi...@huawei.com>
Date: Tue, 14 Nov 2017 10:13:22 +0800
Subject: [PATCH] x86/mce: add support SRAO reported via CMC check

In Intel SDM Volume 3B (253669-063US, July 2017), SRAO could be
reported either via MCE or CMC:

  In cases when SRAO is signaled via CMCI the error signature is
  indicated via UC=1, PCC=0, S=0.

  Type(*1)  UC  EN  PCC S   AR  Signaling
  ---
  UC1   1   1   x   x   MCE
  SRAR  1   1   0   1   1   MCE
  SRAO  1   x(*2)   0   x(*2)   0   MCE/CMC
  UCNA  1   x   0   0   0   CMC
  CE0   x   x   x   x   CMC

  NOTES:
  1. SRAR, SRAO and UCNA errors are supported by the processor only
 when IA32_MCG_CAP[24] (MCG_SER_P) is set.
  2. EN=1, S=1 when signaled via MCE. EN=x, S=0 when signaled via CMC.

And there is a description in 15.6.2 UCR Error Reporting and Logging, for bit S:

  S (Signaling) flag, bit 56 - Indicates (when set) that a machine check
  exception was generated for the UCR error reported in this MC bank...
  When the S flag in the IA32_MCi_STATUS register is clear, this UCR error
  was not signaled via a machine check exception and instead was reported
  as a corrected machine check (CMC).

So we could merge this two case, and just remove the S=0 check for SRAO
in mce_severity().

Signed-off-by: Xie XiuQi <xiexi...@huawei.com>
Tested-by: Chen Wei <chenwe...@huawei.com>
---
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 26 +-
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c 
b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 4ca632a..5bbd06f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -59,6 +59,7 @@
 #define  MCGMASK(x, y) .mcgmask = x, .mcgres = y
 #define  MASK(x, y).mask = x, .result = y
 #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
+#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR)
 #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
 #defineMCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)

@@ -101,6 +102,22 @@
NOSER, BITCLR(MCI_STATUS_UC)
),

+   /*
+* known AO MCACODs reported via MCE or CMC:
+*
+* SRAO could be signaled either via a machine check exception or
+* CMCI with the corresponding bit S 1 or 0. So we don't need to
+* check bit S for SRAO.
+*/
+   MCESEV(
+   AO, "Action optional: memory scrubbing error",
+   SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, 
MCI_STATUS_UC|MCACOD_SCRUB)
+   ),
+   MCESEV(
+   AO, "Action optional: last level cache writeback error",
+   SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, 
MCI_STATUS_UC|MCACOD_L3WB)
+   ),
+
/* ignore OVER for UCNA */
MCESEV(
UCNA, "Uncorrected no action required",
@@ -149,15

Re: [PATCH] x86/mce: add support SRAO reported via CMC check

2017-11-15 Thread Xie XiuQi

Hi Borislav, Tony,

On 2017/11/15 18:33, Borislav Petkov wrote:
> On Wed, Nov 15, 2017 at 02:44:07AM +, Luck, Tony wrote:
>> This code is subtle :-(
> 
> I'm glad that we agree on this! :-)
> 
> Anyone wanting to rewrite it yet?
> 

In Intel SDM Volume 3B (253669-063US, July 2017), SRAO could be
reported either via MCE or CMC:

  In cases when SRAO is signaled via CMCI the error signature is
  indicated via UC=1, PCC=0, S=0.

  Type(*1)  UC  EN  PCC S   AR  Signaling
  ---
  UC1   1   1   x   x   MCE
  SRAR  1   1   0   1   1   MCE
  SRAO  1   x(*2)   0   x(*2)   0   MCE/CMC
  UCNA  1   x   0   0   0   CMC
  CE0   x   x   x   x   CMC

  NOTES:
  1. SRAR, SRAO and UCNA errors are supported by the processor only
 when IA32_MCG_CAP[24] (MCG_SER_P) is set.
  2. EN=1, S=1 when signaled via MCE. EN=x, S=0 when signaled via CMC.

And there is a description in 15.6.2 UCR Error Reporting and Logging, for bit S:

  S (Signaling) flag, bit 56 - Indicates (when set) that a machine check
  exception was generated for the UCR error reported in this MC bank...
  When the S flag in the IA32_MCi_STATUS register is clear, this UCR error
  was not signaled via a machine check exception and instead was reported
  as a corrected machine check (CMC).

As the description in SDM, I think this flag could be used to determine whether
MCE or CMC was triggered. So we could merge this two case in one and just
remove the S=0 check for SRAO.

How about this patch?

>From a06b2a781a86e3b1fe241591b53f7a6d33d63331 Mon Sep 17 00:00:00 2001
From: Xie XiuQi 
Date: Tue, 14 Nov 2017 10:13:22 +0800
Subject: [PATCH] x86/mce: add support SRAO reported via CMC check

In Intel SDM Volume 3B (253669-063US, July 2017), SRAO could be
reported either via MCE or CMC:

  In cases when SRAO is signaled via CMCI the error signature is
  indicated via UC=1, PCC=0, S=0.

  Type(*1)  UC  EN  PCC S   AR  Signaling
  ---
  UC1   1   1   x   x   MCE
  SRAR  1   1   0   1   1   MCE
  SRAO  1   x(*2)   0   x(*2)   0   MCE/CMC
  UCNA  1   x   0   0   0   CMC
  CE0   x   x   x   x   CMC

  NOTES:
  1. SRAR, SRAO and UCNA errors are supported by the processor only
 when IA32_MCG_CAP[24] (MCG_SER_P) is set.
  2. EN=1, S=1 when signaled via MCE. EN=x, S=0 when signaled via CMC.

And there is a description in 15.6.2 UCR Error Reporting and Logging, for bit S:

  S (Signaling) flag, bit 56 - Indicates (when set) that a machine check
  exception was generated for the UCR error reported in this MC bank...
  When the S flag in the IA32_MCi_STATUS register is clear, this UCR error
  was not signaled via a machine check exception and instead was reported
  as a corrected machine check (CMC).

So we could merge this two case, and just remove the S=0 check for SRAO
in mce_severity().

Signed-off-by: Xie XiuQi 
Tested-by: Chen Wei 
---
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 26 +-
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c 
b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 4ca632a..5bbd06f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -59,6 +59,7 @@
 #define  MCGMASK(x, y) .mcgmask = x, .mcgres = y
 #define  MASK(x, y).mask = x, .result = y
 #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
+#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR)
 #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
 #defineMCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)

@@ -101,6 +102,22 @@
NOSER, BITCLR(MCI_STATUS_UC)
),

+   /*
+* known AO MCACODs reported via MCE or CMC:
+*
+* SRAO could be signaled either via a machine check exception or
+* CMCI with the corresponding bit S 1 or 0. So we don't need to
+* check bit S for SRAO.
+*/
+   MCESEV(
+   AO, "Action optional: memory scrubbing error",
+   SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, 
MCI_STATUS_UC|MCACOD_SCRUB)
+   ),
+   MCESEV(
+   AO, "Action optional: last level cache writeback error",
+   SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, 
MCI_STATUS_UC|MCACOD_L3WB)
+   ),
+
/* ignore OVER for UCNA */
MCESEV(
UCNA, "Uncorrected no action required",
@@ -149,15 +166,6 @@
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
),

-   /* known A

Re: [PATCH] x86/mce: add support SRAO reported via CMC check

2017-11-14 Thread Xie XiuQi

Hi Tony,

On 2017/11/15 10:13, Luck, Tony wrote:
>> OK, I'll add check for OVER=0 in v2.
> 
> Thinking a bit more on this ... do you just need to remove the check
> for S=1 from the existing "AO" entries?  Or do we need the CMCI
> match entry early in the table?

Yes, we could just remove the check for S=1, and place "AO" entries before
"UCNA" entries.

Is that right?

> 
> -Tony
> --
> To unsubscribe from this list: send the line "unsubscribe linux-edac" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 

-- 
Thanks,
Xie XiuQi

Re: [PATCH] x86/mce: add support SRAO reported via CMC check

2017-11-14 Thread Xie XiuQi

Hi Tony,

On 2017/11/15 10:13, Luck, Tony wrote:
>> OK, I'll add check for OVER=0 in v2.
> 
> Thinking a bit more on this ... do you just need to remove the check
> for S=1 from the existing "AO" entries?  Or do we need the CMCI
> match entry early in the table?

Yes, we could just remove the check for S=1, and place "AO" entries before
"UCNA" entries.

Is that right?

> 
> -Tony
> --
> To unsubscribe from this list: send the line "unsubscribe linux-edac" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 

-- 
Thanks,
Xie XiuQi

Re: [PATCH] x86/mce: add support SRAO reported via CMC check

2017-11-14 Thread Xie XiuQi

Hi Tony,

On 2017/11/15 2:51, Luck, Tony wrote:
> On Tue, Nov 14, 2017 at 01:55:11PM +0800, Xie XiuQi wrote:
>> +/* known AO MCACODs reported via CMC: */
>> +MCESEV(
>> +AO, "Action optional: memory scrubbing error",
>> +SER, MASK(MCI_UC_SAR|MCACOD_SCRUBMSK, 
>> MCI_STATUS_UC|MCACOD_SCRUB)
> 
> I think you should check for OVER=0 (as the existing AO cases do).
> If there was a patrol scrub reported by CMCI, and then another UC
> error, we can't safely treat this as an AO ... because we have no
> idea what the second UC error was.

OK, I'll add check for OVER=0 in v2.

-- 
Thanks,
Xie XiuQi

> 
> -Tony
> 
> .
>

Re: [PATCH] x86/mce: add support SRAO reported via CMC check

2017-11-14 Thread Xie XiuQi

Hi Tony,

On 2017/11/15 2:51, Luck, Tony wrote:
> On Tue, Nov 14, 2017 at 01:55:11PM +0800, Xie XiuQi wrote:
>> +/* known AO MCACODs reported via CMC: */
>> +MCESEV(
>> +AO, "Action optional: memory scrubbing error",
>> +SER, MASK(MCI_UC_SAR|MCACOD_SCRUBMSK, 
>> MCI_STATUS_UC|MCACOD_SCRUB)
> 
> I think you should check for OVER=0 (as the existing AO cases do).
> If there was a patrol scrub reported by CMCI, and then another UC
> error, we can't safely treat this as an AO ... because we have no
> idea what the second UC error was.

OK, I'll add check for OVER=0 in v2.

-- 
Thanks,
Xie XiuQi

> 
> -Tony
> 
> .
>

[PATCH] x86/mce: add support SRAO reported via CMC check

2017-11-13 Thread Xie XiuQi

In Intel SDM Volume 3B (253669-063US, July 2017), SRAO could be
reported via CMC:

  In cases when SRAO is signaled via CMCI the error signature is
  indicated via UC=1, PCC=0, S=0.

So we add those known AO MCACODs check in mce_severity().

Signed-off-by: Xie XiuQi <xiexi...@huawei.com>
Tested-by: Chen Wei <chenwe...@huawei.com>
---
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c 
b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 4ca632a..48f239a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -101,6 +101,16 @@
NOSER, BITCLR(MCI_STATUS_UC)
),
 
+   /* known AO MCACODs reported via CMC: */
+   MCESEV(
+   AO, "Action optional: memory scrubbing error",
+   SER, MASK(MCI_UC_SAR|MCACOD_SCRUBMSK, 
MCI_STATUS_UC|MCACOD_SCRUB)
+   ),
+   MCESEV(
+   AO, "Action optional: last level cache writeback error",
+   SER, MASK(MCI_UC_SAR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB)
+   ),
+
/* ignore OVER for UCNA */
MCESEV(
UCNA, "Uncorrected no action required",
-- 
1.8.3.1

[PATCH] x86/mce: add support SRAO reported via CMC check

2017-11-13 Thread Xie XiuQi

In Intel SDM Volume 3B (253669-063US, July 2017), SRAO could be
reported via CMC:

  In cases when SRAO is signaled via CMCI the error signature is
  indicated via UC=1, PCC=0, S=0.

So we add those known AO MCACODs check in mce_severity().

Signed-off-by: Xie XiuQi 
Tested-by: Chen Wei 
---
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c 
b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 4ca632a..48f239a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -101,6 +101,16 @@
NOSER, BITCLR(MCI_STATUS_UC)
),
 
+   /* known AO MCACODs reported via CMC: */
+   MCESEV(
+   AO, "Action optional: memory scrubbing error",
+   SER, MASK(MCI_UC_SAR|MCACOD_SCRUBMSK, 
MCI_STATUS_UC|MCACOD_SCRUB)
+   ),
+   MCESEV(
+   AO, "Action optional: last level cache writeback error",
+   SER, MASK(MCI_UC_SAR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB)
+   ),
+
/* ignore OVER for UCNA */
MCESEV(
UCNA, "Uncorrected no action required",
-- 
1.8.3.1

[PATCH v4 0/4] arm64/ras: support sea error recovery

2017-09-27 Thread Xie XiuQi

With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
are consumed. According to the existing process, errors occurred in the
kernel, leading to direct panic, if it occurred the user-space, we should
just kill process.

But there is a class of error, in fact, is not necessary to kill
process, you can recover and continue to run the process. Such as
the instruction data corrupted, where the memory page might be
read-only, which is has not been modified, the disk might have the
correct data, so you can directly drop the page, ant reload it when
necessary.

So this patchset is just try to solve such problem: if the error is
consumed in user-space and the error occurs on a clean page, you can
directly drop the memory page without killing process.

If the corrupted page is clean, just dropped it and return to user-space
without side effects. And if corrupted page is dirty, memory_failure()
will send SIGBUS with code=BUS_MCEERR_AR. While without this patchset,
do_sea() will just send SIGBUS, so the process was killed in the same place.

Because memory_failure() may sleep, we can not call it directly in SEA
exception context. So we saved faulting physical address associated with
a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return
from SEA exception context and get into do_notify_resume() before the
process running, we could check it and call memory_failure() to do
recovery. It's safe, because we are in process context.

In some platform, when SEA triggerred, physical address could be reported
by memory section or by processor section, so we save address at this two
place.

---
v4 - v3:
  - rebase on top of the latest mainline
  - make ghes_arm_process_error as a weak function
  - only pick cache error from arm processor section for error recovery
  - fix s-o-b issue

  https://lkml.org/lkml/2017/9/7/98

v3 - v2:
  - fix patch style issue

v2 - v1:
  - wrap arm_proc_error_check and log_arm_hw_error in a single 
arm_process_error()
  - fix sea_save_info return value issue
  - fix link error if this CONFIG_ARM64_ERR_RECOV is not selected
  - use a notify chain instead of call arch_apei_report_mem_error directly

  https://lkml.org/lkml/2017/9/1/189

Xie XiuQi (4):
  ACPI, CPER: add arm error info type definition
  arm64/ras: support sea error recovery
  GHES: add a notify chain for process memory section
  arm64/ras: save error address from memory section for recovery

 arch/arm64/Kconfig   |  11 +++
 arch/arm64/include/asm/ras.h |  23 +
 arch/arm64/include/asm/thread_info.h |   4 +-
 arch/arm64/kernel/Makefile   |   1 +
 arch/arm64/kernel/ras.c  | 173 +++
 arch/arm64/kernel/signal.c   |   7 ++
 arch/arm64/mm/fault.c|  27 --
 drivers/acpi/apei/ghes.c |  18 +++-
 include/acpi/ghes.h  |  11 +++
 include/linux/cper.h |   5 +
 10 files changed, 270 insertions(+), 10 deletions(-)
 create mode 100644 arch/arm64/include/asm/ras.h
 create mode 100644 arch/arm64/kernel/ras.c

-- 
1.8.3.1

[PATCH v4 4/4] arm64/ras: save error address from memory section for recovery

2017-09-27 Thread Xie XiuQi

In some platform, when SEA triggerred, physical address might be
reported by memory section, so we save it for error recovery later.

Signed-off-by: Xie XiuQi <xiexi...@huawei.com>
---
 arch/arm64/kernel/ras.c | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/arch/arm64/kernel/ras.c b/arch/arm64/kernel/ras.c
index c48503c..ff24181 100644
--- a/arch/arm64/kernel/ras.c
+++ b/arch/arm64/kernel/ras.c
@@ -140,3 +140,34 @@ void ghes_arm_process_error(struct ghes *ghes, struct 
cper_sec_proc_arm *err)
if (info_saved)
set_thread_flag(TIF_SEA_NOTIFY);
 }
+
+int ghes_mem_err_callback(struct notifier_block *nb, unsigned long val, void 
*data)
+{
+   bool info_saved = false;
+   struct ghes_mem_err *ghes_mem = (struct ghes_mem_err *)data;
+   struct cper_sec_mem_err *mem_err = ghes_mem->mem_err;
+
+   if ((ghes_mem->notify_type != ACPI_HEST_NOTIFY_SEA) ||
+   (ghes_mem->severity != CPER_SEV_RECOVERABLE))
+   return 0;
+
+   if (mem_err->validation_bits & CPER_MEM_VALID_PA)
+   info_saved = sea_save_info(mem_err->physical_addr);
+
+   if (info_saved)
+   set_thread_flag(TIF_SEA_NOTIFY);
+
+   return 0;
+}
+
+static struct notifier_block ghes_mem_err_nb = {
+   .notifier_call  = ghes_mem_err_callback,
+};
+
+static int arm64_err_recov_init(void)
+{
+   atomic_notifier_chain_register(_mem_err_chain, _mem_err_nb);
+   return 0;
+}
+
+late_initcall(arm64_err_recov_init);
-- 
1.8.3.1

Re: [PATCH v3 1/3] arm64/ras: support sea error recovery

2017-09-27 Thread Xie XiuQi

Hi James,

I'll send v4 to fix some small issue, for same one who interested
with this feature could test it. For the question you mentioned,
I will study in depth.

Any comments is welcome.

On 2017/9/16 2:33, James Morse wrote:
> Hi Xie XiuQi,
> 
> On 11/09/17 15:11, Xie XiuQi wrote:
>> I first describe the approach of this patchset:
>>
>> A memory access error on the execution path usually triggers SEA.
>> According to the existing process, errors occurred in the kernel,
>> leading to direct panic, if it occurred the user-space, we should
>> just kill process.
>>
>> But there is a class of error, in fact, is not necessary to kill
>> process, you can recover and continue to run the process. Such as
>> the instruction data corrupted, where the memory page might be
>> read-only, which is has not been modified, the disk might have the
>> correct data, so you can directly drop the page, ant reload it when
>> necessary.
>>
>> So this patchset is just try to solve such problem: if the error is
>> consumed in user-space and the error occurs on a clean page, you can
>> directly drop the memory page without killing process.
>>
>> This is implemented in memory_failure, which is generic process.
> 
>> The error reported by SEA should be handled before re-enter the process,
>> or we must kill the process to prevent error propagation.
>>
>> memory_failure_queue() is asynchronous, in which, error info was saved
>> at ghes_proc, but handled in kworker. During this period there is a context
>> switching, so we can not determine which process would be switch to. So
>> memory_failure_queue is not suitable for handling the problem.
> 
> Thanks for this summary. I see the problem you're trying to solve is when
> memory_failure() runs, in your scenario its not guaranteed to run before we
> return to user space.
> 
> What is the user-visible symptom of this? SIGBUS, code=0 instead of SIGBUS,
> code=...MCEERR_A?
> 
> ..in which case I'm looking at this as a race with the memory_failure() bottom
> half via schedule_work().
> 
> How does x86 avoid this same problem?
> 
> 
>> And memory_failure is not nmi-safe, so it can not be called directly in the
>> SEA context. So we just handle this error at SEA exit path, and before 
>> context
>> switching.
> 
> (I need to look more into which locks memory_failure() is taking)
> 
> 
>> In FFH mode, physical address can only be obtained by parsing the GHES table.
>> But we only care about SEA, so the error handling is tied to the type of 
>> notification.
> 
> I care about all the notification methods. Once the notification has been 
> passed
> to APEI I want them to behave the same so that we don't have subtle bugs 
> between
> the 11 different ways we could get a notification. This code is rarely tested
> enough as it is.
> 
>>From the arch code I just want to call out to APEI asking 'is this yours?'. If
> so I expect APEI to have done all the work, if not we take the v8.0 behaviour.
> 
> 
> Here you need APEI and the arch code to spot 'SEA' and treat it differently,
> invoking some arm64-specific behaviour for APEI, and some
> not-really-arch-specific code under /arch/arm64. There is nothing arm64 
> specific
> about your arm_process_error(), how come the core APEI code doesn't need to 
> do this?
> 
> 
> I think this is caused by the way memory_failure() schedules its work, and 
> that
> is where I'd like to try and fix this, so that its the same for all 
> notification
> methods and all (cough: both) architectures.
> 
> 
>> The TIF flag is checked on a generic path, but it will only be set when SEA 
>> occurs.
>> And if we use unlikely optimization, it should have little impact on 
>> performance.
> 
> Yes, the arch code checks _TIF_WORK_MASK in one go so there is no performance
> problem for code that hasn't taken the RAS-Error. (and once we've taken a RAS
> error performance is out the window!)
> 
> 
>> And the TIF flag approach was used on x86 platform for years, until commit 
>> d4812e169d
> 
> ... so x86 doesn't do this ...
> 
>> (x86, mce: Get rid of TIF_MCE_NOTIFY and associated mce tricks)[0]. On 
>> currently arm64
>> platform, there is no IST interrupt[1] function, so we could not call 
>> memory_failure
>> directly in SEA context. So the way to use TIF notification, is also a good 
>> choice,
>> after all, the same way on x86 platform is verified.
> 
> Thanks, looks like I need to read more of the history of x86's kernel-first
> handling...
> 
> 
> Thanks,
> 
> James
> 
> 
> .
> 

-- 
Thanks,
Xie XiuQi

[PATCH v4 0/4] arm64/ras: support sea error recovery

2017-09-27 Thread Xie XiuQi

With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
are consumed. According to the existing process, errors occurred in the
kernel, leading to direct panic, if it occurred the user-space, we should
just kill process.

But there is a class of error, in fact, is not necessary to kill
process, you can recover and continue to run the process. Such as
the instruction data corrupted, where the memory page might be
read-only, which is has not been modified, the disk might have the
correct data, so you can directly drop the page, ant reload it when
necessary.

So this patchset is just try to solve such problem: if the error is
consumed in user-space and the error occurs on a clean page, you can
directly drop the memory page without killing process.

If the corrupted page is clean, just dropped it and return to user-space
without side effects. And if corrupted page is dirty, memory_failure()
will send SIGBUS with code=BUS_MCEERR_AR. While without this patchset,
do_sea() will just send SIGBUS, so the process was killed in the same place.

Because memory_failure() may sleep, we can not call it directly in SEA
exception context. So we saved faulting physical address associated with
a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return
from SEA exception context and get into do_notify_resume() before the
process running, we could check it and call memory_failure() to do
recovery. It's safe, because we are in process context.

In some platform, when SEA triggerred, physical address could be reported
by memory section or by processor section, so we save address at this two
place.

---
v4 - v3:
  - rebase on top of the latest mainline
  - make ghes_arm_process_error as a weak function
  - only pick cache error from arm processor section for error recovery
  - fix s-o-b issue

  https://lkml.org/lkml/2017/9/7/98

v3 - v2:
  - fix patch style issue

v2 - v1:
  - wrap arm_proc_error_check and log_arm_hw_error in a single 
arm_process_error()
  - fix sea_save_info return value issue
  - fix link error if this CONFIG_ARM64_ERR_RECOV is not selected
  - use a notify chain instead of call arch_apei_report_mem_error directly

  https://lkml.org/lkml/2017/9/1/189

Xie XiuQi (4):
  ACPI, CPER: add arm error info type definition
  arm64/ras: support sea error recovery
  GHES: add a notify chain for process memory section
  arm64/ras: save error address from memory section for recovery

 arch/arm64/Kconfig   |  11 +++
 arch/arm64/include/asm/ras.h |  23 +
 arch/arm64/include/asm/thread_info.h |   4 +-
 arch/arm64/kernel/Makefile   |   1 +
 arch/arm64/kernel/ras.c  | 173 +++
 arch/arm64/kernel/signal.c   |   7 ++
 arch/arm64/mm/fault.c|  27 --
 drivers/acpi/apei/ghes.c |  18 +++-
 include/acpi/ghes.h  |  11 +++
 include/linux/cper.h |   5 +
 10 files changed, 270 insertions(+), 10 deletions(-)
 create mode 100644 arch/arm64/include/asm/ras.h
 create mode 100644 arch/arm64/kernel/ras.c

-- 
1.8.3.1

[PATCH v4 4/4] arm64/ras: save error address from memory section for recovery

2017-09-27 Thread Xie XiuQi

In some platform, when SEA triggerred, physical address might be
reported by memory section, so we save it for error recovery later.

Signed-off-by: Xie XiuQi 
---
 arch/arm64/kernel/ras.c | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/arch/arm64/kernel/ras.c b/arch/arm64/kernel/ras.c
index c48503c..ff24181 100644
--- a/arch/arm64/kernel/ras.c
+++ b/arch/arm64/kernel/ras.c
@@ -140,3 +140,34 @@ void ghes_arm_process_error(struct ghes *ghes, struct 
cper_sec_proc_arm *err)
if (info_saved)
set_thread_flag(TIF_SEA_NOTIFY);
 }
+
+int ghes_mem_err_callback(struct notifier_block *nb, unsigned long val, void 
*data)
+{
+   bool info_saved = false;
+   struct ghes_mem_err *ghes_mem = (struct ghes_mem_err *)data;
+   struct cper_sec_mem_err *mem_err = ghes_mem->mem_err;
+
+   if ((ghes_mem->notify_type != ACPI_HEST_NOTIFY_SEA) ||
+   (ghes_mem->severity != CPER_SEV_RECOVERABLE))
+   return 0;
+
+   if (mem_err->validation_bits & CPER_MEM_VALID_PA)
+   info_saved = sea_save_info(mem_err->physical_addr);
+
+   if (info_saved)
+   set_thread_flag(TIF_SEA_NOTIFY);
+
+   return 0;
+}
+
+static struct notifier_block ghes_mem_err_nb = {
+   .notifier_call  = ghes_mem_err_callback,
+};
+
+static int arm64_err_recov_init(void)
+{
+   atomic_notifier_chain_register(_mem_err_chain, _mem_err_nb);
+   return 0;
+}
+
+late_initcall(arm64_err_recov_init);
-- 
1.8.3.1

Re: [PATCH v3 1/3] arm64/ras: support sea error recovery

2017-09-27 Thread Xie XiuQi

Hi James,

I'll send v4 to fix some small issue, for same one who interested
with this feature could test it. For the question you mentioned,
I will study in depth.

Any comments is welcome.

On 2017/9/16 2:33, James Morse wrote:
> Hi Xie XiuQi,
> 
> On 11/09/17 15:11, Xie XiuQi wrote:
>> I first describe the approach of this patchset:
>>
>> A memory access error on the execution path usually triggers SEA.
>> According to the existing process, errors occurred in the kernel,
>> leading to direct panic, if it occurred the user-space, we should
>> just kill process.
>>
>> But there is a class of error, in fact, is not necessary to kill
>> process, you can recover and continue to run the process. Such as
>> the instruction data corrupted, where the memory page might be
>> read-only, which is has not been modified, the disk might have the
>> correct data, so you can directly drop the page, ant reload it when
>> necessary.
>>
>> So this patchset is just try to solve such problem: if the error is
>> consumed in user-space and the error occurs on a clean page, you can
>> directly drop the memory page without killing process.
>>
>> This is implemented in memory_failure, which is generic process.
> 
>> The error reported by SEA should be handled before re-enter the process,
>> or we must kill the process to prevent error propagation.
>>
>> memory_failure_queue() is asynchronous, in which, error info was saved
>> at ghes_proc, but handled in kworker. During this period there is a context
>> switching, so we can not determine which process would be switch to. So
>> memory_failure_queue is not suitable for handling the problem.
> 
> Thanks for this summary. I see the problem you're trying to solve is when
> memory_failure() runs, in your scenario its not guaranteed to run before we
> return to user space.
> 
> What is the user-visible symptom of this? SIGBUS, code=0 instead of SIGBUS,
> code=...MCEERR_A?
> 
> ..in which case I'm looking at this as a race with the memory_failure() bottom
> half via schedule_work().
> 
> How does x86 avoid this same problem?
> 
> 
>> And memory_failure is not nmi-safe, so it can not be called directly in the
>> SEA context. So we just handle this error at SEA exit path, and before 
>> context
>> switching.
> 
> (I need to look more into which locks memory_failure() is taking)
> 
> 
>> In FFH mode, physical address can only be obtained by parsing the GHES table.
>> But we only care about SEA, so the error handling is tied to the type of 
>> notification.
> 
> I care about all the notification methods. Once the notification has been 
> passed
> to APEI I want them to behave the same so that we don't have subtle bugs 
> between
> the 11 different ways we could get a notification. This code is rarely tested
> enough as it is.
> 
>>From the arch code I just want to call out to APEI asking 'is this yours?'. If
> so I expect APEI to have done all the work, if not we take the v8.0 behaviour.
> 
> 
> Here you need APEI and the arch code to spot 'SEA' and treat it differently,
> invoking some arm64-specific behaviour for APEI, and some
> not-really-arch-specific code under /arch/arm64. There is nothing arm64 
> specific
> about your arm_process_error(), how come the core APEI code doesn't need to 
> do this?
> 
> 
> I think this is caused by the way memory_failure() schedules its work, and 
> that
> is where I'd like to try and fix this, so that its the same for all 
> notification
> methods and all (cough: both) architectures.
> 
> 
>> The TIF flag is checked on a generic path, but it will only be set when SEA 
>> occurs.
>> And if we use unlikely optimization, it should have little impact on 
>> performance.
> 
> Yes, the arch code checks _TIF_WORK_MASK in one go so there is no performance
> problem for code that hasn't taken the RAS-Error. (and once we've taken a RAS
> error performance is out the window!)
> 
> 
>> And the TIF flag approach was used on x86 platform for years, until commit 
>> d4812e169d
> 
> ... so x86 doesn't do this ...
> 
>> (x86, mce: Get rid of TIF_MCE_NOTIFY and associated mce tricks)[0]. On 
>> currently arm64
>> platform, there is no IST interrupt[1] function, so we could not call 
>> memory_failure
>> directly in SEA context. So the way to use TIF notification, is also a good 
>> choice,
>> after all, the same way on x86 platform is verified.
> 
> Thanks, looks like I need to read more of the history of x86's kernel-first
> handling...
> 
> 
> Thanks,
> 
> James
> 
> 
> .
> 

-- 
Thanks,
Xie XiuQi

[PATCH v4 1/4] ACPI, CPER: add arm error info type definition

2017-09-27 Thread Xie XiuQi

Add arm error info type definition according to ACPI6.1
Table 261. ARM Processor Error Information Structure,
which is used for error recovery in the following patches.

Signed-off-by: Xie XiuQi <xiexi...@huawei.com>
---
 include/linux/cper.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/include/linux/cper.h b/include/linux/cper.h
index 723e952..9106e2b 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -275,6 +275,11 @@ enum {
 #define CPER_ARM_INFO_FLAGS_PROPAGATED BIT(2)
 #define CPER_ARM_INFO_FLAGS_OVERFLOW   BIT(3)
 
+#define CPER_ARM_INFO_TYPE_CACHE   0
+#define CPER_ARM_INFO_TYPE_TLB 1
+#define CPER_ARM_INFO_TYPE_BUS 2
+#define CPER_ARM_INFO_TYPE_UARCH   3
+
 /*
  * All tables and structs must be byte-packed to match CPER
  * specification, since the tables are provided by the system BIOS
-- 
1.8.3.1

[PATCH v4 2/4] arm64/ras: support sea error recovery

2017-09-27 Thread Xie XiuQi

With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
are consumed. According to the existing process, errors occurred in the
kernel, leading to direct panic, if it occurred the user-space, we should
just kill process.

But there is a class of error, in fact, is not necessary to kill
process, you can recover and continue to run the process. Such as
the instruction data corrupted, where the memory page might be
read-only, which is has not been modified, the disk might have the
correct data, so you can directly drop the page, ant reload it when
necessary.

So this patchset is just try to solve such problem: if the error is
consumed in user-space and the error occurs on a clean page, you can
directly drop the memory page without killing process.

If the corrupted page is clean, just dropped it and return to user-space
without side effects. And if corrupted page is dirty, memory_failure()
will send SIGBUS with code=BUS_MCEERR_AR. While without this patchset,
do_sea() will just send SIGBUS, so the process was killed in the same place.

Because memory_failure() may sleep, we can not call it directly in SEA
exception context. So we saved faulting physical address associated with
a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return
from SEA exception context and get into do_notify_resume() before the
process running, we could check it and call memory_failure() to do
recovery. It's safe, because we are in process context.

Signed-off-by: Xie XiuQi <xiexi...@huawei.com>
Tested-by: Wang Xiongfeng <wangxiongfe...@huawei.com>
Cc: James Morse <james.mo...@arm.com>
Cc: Borislav Petkov <b...@suse.de>
Cc: Julien Thierry <julien.thie...@arm.com>
Cc: Hanjun Guo <hanjun@linaro.org>
---
 arch/arm64/Kconfig   |  11 +++
 arch/arm64/include/asm/ras.h |  23 ++
 arch/arm64/include/asm/thread_info.h |   4 +-
 arch/arm64/kernel/Makefile   |   1 +
 arch/arm64/kernel/ras.c  | 142 +++
 arch/arm64/kernel/signal.c   |   7 ++
 arch/arm64/mm/fault.c|  27 +--
 drivers/acpi/apei/ghes.c |   8 +-
 include/acpi/ghes.h  |   3 +
 9 files changed, 216 insertions(+), 10 deletions(-)
 create mode 100644 arch/arm64/include/asm/ras.h
 create mode 100644 arch/arm64/kernel/ras.c

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 0df64a6..0c2d0a0 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -641,6 +641,17 @@ config HOTPLUG_CPU
  Say Y here to experiment with turning CPUs off and on.  CPUs
  can be controlled through /sys/devices/system/cpu.
 
+config ARM64_ERR_RECOV
+   bool "Support arm64 RAS error recovery"
+   depends on ACPI_APEI_SEA && MEMORY_FAILURE
+   help
+ With ARM v8.2 RAS Extension, SEA are usually triggered when memory 
errors
+ are consumed. In some cases, if the error address is in a clean page 
or a
+ read-only page, there is a chance to recover. Such as error occurs in 
a
+ instruction page, we can reread this page from disk instead of 
killing process.
+
+ Say Y if unsure.
+
 # Common NUMA Features
 config NUMA
bool "Numa Memory Allocation and Scheduler Support"
diff --git a/arch/arm64/include/asm/ras.h b/arch/arm64/include/asm/ras.h
new file mode 100644
index 000..f0f18da
--- /dev/null
+++ b/arch/arm64/include/asm/ras.h
@@ -0,0 +1,23 @@
+/*
+ * ARM64 SEA error recoery support
+ *
+ * Copyright 2017 Huawei Technologies Co., Ltd.
+ *   Author: Xie XiuQi <xiexi...@huawei.com>
+ *   Author: Wang Xiongfeng <wangxiongfe...@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _ASM_RAS_H
+#define _ASM_RAS_H
+
+extern void sea_notify_process(void);
+
+#endif /*_ASM_RAS_H*/
diff --git a/arch/arm64/include/asm/thread_info.h 
b/arch/arm64/include/asm/thread_info.h
index ddded64..b31b308 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -82,6 +82,7 @@ struct thread_info {
 #define TIF_FOREIGN_FPSTATE3   /* CPU's FP state is not current's */
 #define TIF_UPROBE 4   /* uprobe breakpoint or singlestep */
 #define TIF_FSCHECK5   /* Check FS is USER_DS on return */
+#define TIF_SEA_NOTIFY  6   /* notify to do an error recovery */
 #define TIF_NOHZ   7
 #define TIF_SYSCALL_TRACE  8
 #define TIF_SYSCALL_AUDIT  9
@@ -98,6 +99,7 @@ struct thread_info {
 #define _TIF_N

[PATCH v4 3/4] GHES: add a notify chain for process memory section

2017-09-27 Thread Xie XiuQi

Add a notify chain for process memory section, with
which other modules might do error recovery.

Signed-off-by: Xie XiuQi <xiexi...@huawei.com>
---
 drivers/acpi/apei/ghes.c | 10 ++
 include/acpi/ghes.h  |  8 
 2 files changed, 18 insertions(+)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 2abf673..56b7336 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -108,6 +108,9 @@ static inline bool is_hest_type_generic_v2(struct ghes 
*ghes)
 static LIST_HEAD(ghes_hed);
 static DEFINE_MUTEX(ghes_list_mutex);
 
+ATOMIC_NOTIFIER_HEAD(ghes_mem_err_chain);
+EXPORT_SYMBOL(ghes_mem_err_chain);
+
 /*
  * Because the memory area used to transfer hardware error information
  * from BIOS to Linux can be determined only in NMI, IRQ or timer
@@ -485,6 +488,13 @@ static void ghes_do_proc(struct ghes *ghes,
 
if (guid_equal(sec_type, _SEC_PLATFORM_MEM)) {
struct cper_sec_mem_err *mem_err = 
acpi_hest_get_payload(gdata);
+   struct ghes_mem_err mem;
+
+   mem.notify_type = ghes->generic->notify.type;
+   mem.severity = gdata->error_severity;
+   mem.mem_err = mem_err;
+
+   atomic_notifier_call_chain(_mem_err_chain, 0, 
);
 
ghes_edac_report_mem_error(ghes, sev, mem_err);
 
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 8bcb8e5..ab51e40 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -123,4 +123,12 @@ static inline void *acpi_hest_get_next(struct 
acpi_hest_generic_data *gdata)
 extern void ghes_arm_process_error(struct ghes *ghes,
   struct cper_sec_proc_arm *err);
 
+struct ghes_mem_err {
+   int notify_type;
+   int severity;
+   struct cper_sec_mem_err *mem_err;
+};
+
+extern struct atomic_notifier_head ghes_mem_err_chain;
+
 #endif /* GHES_H */
-- 
1.8.3.1

[PATCH v4 1/4] ACPI, CPER: add arm error info type definition

2017-09-27 Thread Xie XiuQi

Add arm error info type definition according to ACPI6.1
Table 261. ARM Processor Error Information Structure,
which is used for error recovery in the following patches.

Signed-off-by: Xie XiuQi 
---
 include/linux/cper.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/include/linux/cper.h b/include/linux/cper.h
index 723e952..9106e2b 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -275,6 +275,11 @@ enum {
 #define CPER_ARM_INFO_FLAGS_PROPAGATED BIT(2)
 #define CPER_ARM_INFO_FLAGS_OVERFLOW   BIT(3)
 
+#define CPER_ARM_INFO_TYPE_CACHE   0
+#define CPER_ARM_INFO_TYPE_TLB 1
+#define CPER_ARM_INFO_TYPE_BUS 2
+#define CPER_ARM_INFO_TYPE_UARCH   3
+
 /*
  * All tables and structs must be byte-packed to match CPER
  * specification, since the tables are provided by the system BIOS
-- 
1.8.3.1

[PATCH v4 2/4] arm64/ras: support sea error recovery

2017-09-27 Thread Xie XiuQi

With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
are consumed. According to the existing process, errors occurred in the
kernel, leading to direct panic, if it occurred the user-space, we should
just kill process.

But there is a class of error, in fact, is not necessary to kill
process, you can recover and continue to run the process. Such as
the instruction data corrupted, where the memory page might be
read-only, which is has not been modified, the disk might have the
correct data, so you can directly drop the page, ant reload it when
necessary.

So this patchset is just try to solve such problem: if the error is
consumed in user-space and the error occurs on a clean page, you can
directly drop the memory page without killing process.

If the corrupted page is clean, just dropped it and return to user-space
without side effects. And if corrupted page is dirty, memory_failure()
will send SIGBUS with code=BUS_MCEERR_AR. While without this patchset,
do_sea() will just send SIGBUS, so the process was killed in the same place.

Because memory_failure() may sleep, we can not call it directly in SEA
exception context. So we saved faulting physical address associated with
a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return
from SEA exception context and get into do_notify_resume() before the
process running, we could check it and call memory_failure() to do
recovery. It's safe, because we are in process context.

Signed-off-by: Xie XiuQi 
Tested-by: Wang Xiongfeng 
Cc: James Morse 
Cc: Borislav Petkov 
Cc: Julien Thierry 
Cc: Hanjun Guo 
---
 arch/arm64/Kconfig   |  11 +++
 arch/arm64/include/asm/ras.h |  23 ++
 arch/arm64/include/asm/thread_info.h |   4 +-
 arch/arm64/kernel/Makefile   |   1 +
 arch/arm64/kernel/ras.c  | 142 +++
 arch/arm64/kernel/signal.c   |   7 ++
 arch/arm64/mm/fault.c|  27 +--
 drivers/acpi/apei/ghes.c |   8 +-
 include/acpi/ghes.h  |   3 +
 9 files changed, 216 insertions(+), 10 deletions(-)
 create mode 100644 arch/arm64/include/asm/ras.h
 create mode 100644 arch/arm64/kernel/ras.c

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 0df64a6..0c2d0a0 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -641,6 +641,17 @@ config HOTPLUG_CPU
  Say Y here to experiment with turning CPUs off and on.  CPUs
  can be controlled through /sys/devices/system/cpu.
 
+config ARM64_ERR_RECOV
+   bool "Support arm64 RAS error recovery"
+   depends on ACPI_APEI_SEA && MEMORY_FAILURE
+   help
+ With ARM v8.2 RAS Extension, SEA are usually triggered when memory 
errors
+ are consumed. In some cases, if the error address is in a clean page 
or a
+ read-only page, there is a chance to recover. Such as error occurs in 
a
+ instruction page, we can reread this page from disk instead of 
killing process.
+
+ Say Y if unsure.
+
 # Common NUMA Features
 config NUMA
bool "Numa Memory Allocation and Scheduler Support"
diff --git a/arch/arm64/include/asm/ras.h b/arch/arm64/include/asm/ras.h
new file mode 100644
index 000..f0f18da
--- /dev/null
+++ b/arch/arm64/include/asm/ras.h
@@ -0,0 +1,23 @@
+/*
+ * ARM64 SEA error recoery support
+ *
+ * Copyright 2017 Huawei Technologies Co., Ltd.
+ *   Author: Xie XiuQi 
+ *   Author: Wang Xiongfeng 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _ASM_RAS_H
+#define _ASM_RAS_H
+
+extern void sea_notify_process(void);
+
+#endif /*_ASM_RAS_H*/
diff --git a/arch/arm64/include/asm/thread_info.h 
b/arch/arm64/include/asm/thread_info.h
index ddded64..b31b308 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -82,6 +82,7 @@ struct thread_info {
 #define TIF_FOREIGN_FPSTATE3   /* CPU's FP state is not current's */
 #define TIF_UPROBE 4   /* uprobe breakpoint or singlestep */
 #define TIF_FSCHECK5   /* Check FS is USER_DS on return */
+#define TIF_SEA_NOTIFY  6   /* notify to do an error recovery */
 #define TIF_NOHZ   7
 #define TIF_SYSCALL_TRACE  8
 #define TIF_SYSCALL_AUDIT  9
@@ -98,6 +99,7 @@ struct thread_info {
 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
 #define _TIF_FOREIGN_FPSTATE   (1 << TIF_FOREIGN_FPSTATE)
 #define _TIF_NOHZ  (1 << TIF_NOHZ)
+#define _TIF_SEA_NOTIFY (1 << TIF_SEA_NOTIFY)
 #d

[PATCH v4 3/4] GHES: add a notify chain for process memory section

2017-09-27 Thread Xie XiuQi

Add a notify chain for process memory section, with
which other modules might do error recovery.

Signed-off-by: Xie XiuQi 
---
 drivers/acpi/apei/ghes.c | 10 ++
 include/acpi/ghes.h  |  8 
 2 files changed, 18 insertions(+)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 2abf673..56b7336 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -108,6 +108,9 @@ static inline bool is_hest_type_generic_v2(struct ghes 
*ghes)
 static LIST_HEAD(ghes_hed);
 static DEFINE_MUTEX(ghes_list_mutex);
 
+ATOMIC_NOTIFIER_HEAD(ghes_mem_err_chain);
+EXPORT_SYMBOL(ghes_mem_err_chain);
+
 /*
  * Because the memory area used to transfer hardware error information
  * from BIOS to Linux can be determined only in NMI, IRQ or timer
@@ -485,6 +488,13 @@ static void ghes_do_proc(struct ghes *ghes,
 
if (guid_equal(sec_type, _SEC_PLATFORM_MEM)) {
struct cper_sec_mem_err *mem_err = 
acpi_hest_get_payload(gdata);
+   struct ghes_mem_err mem;
+
+   mem.notify_type = ghes->generic->notify.type;
+   mem.severity = gdata->error_severity;
+   mem.mem_err = mem_err;
+
+   atomic_notifier_call_chain(_mem_err_chain, 0, 
);
 
ghes_edac_report_mem_error(ghes, sev, mem_err);
 
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 8bcb8e5..ab51e40 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -123,4 +123,12 @@ static inline void *acpi_hest_get_next(struct 
acpi_hest_generic_data *gdata)
 extern void ghes_arm_process_error(struct ghes *ghes,
   struct cper_sec_proc_arm *err);
 
+struct ghes_mem_err {
+   int notify_type;
+   int severity;
+   struct cper_sec_mem_err *mem_err;
+};
+
+extern struct atomic_notifier_head ghes_mem_err_chain;
+
 #endif /* GHES_H */
-- 
1.8.3.1

Re: [PATCH v3 1/3] arm64/ras: support sea error recovery

2017-09-20 Thread Xie XiuQi

Hi James,

I try to explain some of following questions.

On 2017/9/16 2:33, James Morse wrote:
> Hi Xie XiuQi,
> 
> On 11/09/17 15:11, Xie XiuQi wrote:
>> I first describe the approach of this patchset:
>>
>> A memory access error on the execution path usually triggers SEA.
>> According to the existing process, errors occurred in the kernel,
>> leading to direct panic, if it occurred the user-space, we should
>> just kill process.
>>
>> But there is a class of error, in fact, is not necessary to kill
>> process, you can recover and continue to run the process. Such as
>> the instruction data corrupted, where the memory page might be
>> read-only, which is has not been modified, the disk might have the
>> correct data, so you can directly drop the page, ant reload it when
>> necessary.
>>
>> So this patchset is just try to solve such problem: if the error is
>> consumed in user-space and the error occurs on a clean page, you can
>> directly drop the memory page without killing process.
>>
>> This is implemented in memory_failure, which is generic process.
> 
>> The error reported by SEA should be handled before re-enter the process,
>> or we must kill the process to prevent error propagation.
>>
>> memory_failure_queue() is asynchronous, in which, error info was saved
>> at ghes_proc, but handled in kworker. During this period there is a context
>> switching, so we can not determine which process would be switch to. So
>> memory_failure_queue is not suitable for handling the problem.
> 
> Thanks for this summary. I see the problem you're trying to solve is when
> memory_failure() runs, in your scenario its not guaranteed to run before we
> return to user space.
> 
> What is the user-visible symptom of this? SIGBUS, code=0 instead of SIGBUS,
> code=...MCEERR_A?

If the corrupted page is clean, just dropped it and return to user-space without
side effects. And if corrupted page is dirty, memory_failure() will send SIGBUS
with code=BUS_MCEERR_AR.

> 
> ..in which case I'm looking at this as a race with the memory_failure() bottom
> half via schedule_work().

Do you mean there is a race between memory_failure() from do_notify_resume() and
from schedule_work()? Indeed there a race here, thank you for pointing out that.

Actually, memory_failure_queue() is not needed in SEA case if we process it in
synchronous.

I'll try to solve this issue later.

> 
> How does x86 avoid this same problem?
> 
> 
>> And memory_failure is not nmi-safe, so it can not be called directly in the
>> SEA context. So we just handle this error at SEA exit path, and before 
>> context
>> switching.
> 
> (I need to look more into which locks memory_failure() is taking)
> 
> 
>> In FFH mode, physical address can only be obtained by parsing the GHES table.
>> But we only care about SEA, so the error handling is tied to the type of 
>> notification.
> 
> I care about all the notification methods. Once the notification has been 
> passed
> to APEI I want them to behave the same so that we don't have subtle bugs 
> between
> the 11 different ways we could get a notification. This code is rarely tested
> enough as it is.
> 
>>From the arch code I just want to call out to APEI asking 'is this yours?'. If
> so I expect APEI to have done all the work, if not we take the v8.0 behaviour.
> 
> 
> Here you need APEI and the arch code to spot 'SEA' and treat it differently,
> invoking some arm64-specific behaviour for APEI, and some
> not-really-arch-specific code under /arch/arm64. There is nothing arm64 
> specific
> about your arm_process_error(), how come the core APEI code doesn't need to 
> do this?
> 
> 
> I think this is caused by the way memory_failure() schedules its work, and 
> that
> is where I'd like to try and fix this, so that its the same for all 
> notification
> methods and all (cough: both) architectures.

I try to see if there is a better way.

> 
> 
>> The TIF flag is checked on a generic path, but it will only be set when SEA 
>> occurs.
>> And if we use unlikely optimization, it should have little impact on 
>> performance.
> 
> Yes, the arch code checks _TIF_WORK_MASK in one go so there is no performance
> problem for code that hasn't taken the RAS-Error. (and once we've taken a RAS
> error performance is out the window!)
> 
>> And the TIF flag approach was used on x86 platform for years, until commit 
>> d4812e169d
> 
> ... so x86 doesn't do this ...
> 
>> (x86, mce: Get rid of TIF_MCE_NOTIFY and associated mce tricks)[0]. On 
>> currently arm64
>> platform, there is no IST interrupt[1] function, so we could not call 
>> memory_failure
>> directly in SEA context. So the way to use TIF notification, is also a good 
>> choice,
>> after all, the same way on x86 platform is verified.
> 
> Thanks, looks like I need to read more of the history of x86's kernel-first
> handling...
> 
> 
> Thanks,
> 
> James
> 
> 
> .
> 

-- 
Thanks,
Xie XiuQi

Re: [PATCH v3 1/3] arm64/ras: support sea error recovery

2017-09-20 Thread Xie XiuQi

Hi James,

I try to explain some of following questions.

On 2017/9/16 2:33, James Morse wrote:
> Hi Xie XiuQi,
> 
> On 11/09/17 15:11, Xie XiuQi wrote:
>> I first describe the approach of this patchset:
>>
>> A memory access error on the execution path usually triggers SEA.
>> According to the existing process, errors occurred in the kernel,
>> leading to direct panic, if it occurred the user-space, we should
>> just kill process.
>>
>> But there is a class of error, in fact, is not necessary to kill
>> process, you can recover and continue to run the process. Such as
>> the instruction data corrupted, where the memory page might be
>> read-only, which is has not been modified, the disk might have the
>> correct data, so you can directly drop the page, ant reload it when
>> necessary.
>>
>> So this patchset is just try to solve such problem: if the error is
>> consumed in user-space and the error occurs on a clean page, you can
>> directly drop the memory page without killing process.
>>
>> This is implemented in memory_failure, which is generic process.
> 
>> The error reported by SEA should be handled before re-enter the process,
>> or we must kill the process to prevent error propagation.
>>
>> memory_failure_queue() is asynchronous, in which, error info was saved
>> at ghes_proc, but handled in kworker. During this period there is a context
>> switching, so we can not determine which process would be switch to. So
>> memory_failure_queue is not suitable for handling the problem.
> 
> Thanks for this summary. I see the problem you're trying to solve is when
> memory_failure() runs, in your scenario its not guaranteed to run before we
> return to user space.
> 
> What is the user-visible symptom of this? SIGBUS, code=0 instead of SIGBUS,
> code=...MCEERR_A?

If the corrupted page is clean, just dropped it and return to user-space without
side effects. And if corrupted page is dirty, memory_failure() will send SIGBUS
with code=BUS_MCEERR_AR.

> 
> ..in which case I'm looking at this as a race with the memory_failure() bottom
> half via schedule_work().

Do you mean there is a race between memory_failure() from do_notify_resume() and
from schedule_work()? Indeed there a race here, thank you for pointing out that.

Actually, memory_failure_queue() is not needed in SEA case if we process it in
synchronous.

I'll try to solve this issue later.

> 
> How does x86 avoid this same problem?
> 
> 
>> And memory_failure is not nmi-safe, so it can not be called directly in the
>> SEA context. So we just handle this error at SEA exit path, and before 
>> context
>> switching.
> 
> (I need to look more into which locks memory_failure() is taking)
> 
> 
>> In FFH mode, physical address can only be obtained by parsing the GHES table.
>> But we only care about SEA, so the error handling is tied to the type of 
>> notification.
> 
> I care about all the notification methods. Once the notification has been 
> passed
> to APEI I want them to behave the same so that we don't have subtle bugs 
> between
> the 11 different ways we could get a notification. This code is rarely tested
> enough as it is.
> 
>>From the arch code I just want to call out to APEI asking 'is this yours?'. If
> so I expect APEI to have done all the work, if not we take the v8.0 behaviour.
> 
> 
> Here you need APEI and the arch code to spot 'SEA' and treat it differently,
> invoking some arm64-specific behaviour for APEI, and some
> not-really-arch-specific code under /arch/arm64. There is nothing arm64 
> specific
> about your arm_process_error(), how come the core APEI code doesn't need to 
> do this?
> 
> 
> I think this is caused by the way memory_failure() schedules its work, and 
> that
> is where I'd like to try and fix this, so that its the same for all 
> notification
> methods and all (cough: both) architectures.

I try to see if there is a better way.

> 
> 
>> The TIF flag is checked on a generic path, but it will only be set when SEA 
>> occurs.
>> And if we use unlikely optimization, it should have little impact on 
>> performance.
> 
> Yes, the arch code checks _TIF_WORK_MASK in one go so there is no performance
> problem for code that hasn't taken the RAS-Error. (and once we've taken a RAS
> error performance is out the window!)
> 
>> And the TIF flag approach was used on x86 platform for years, until commit 
>> d4812e169d
> 
> ... so x86 doesn't do this ...
> 
>> (x86, mce: Get rid of TIF_MCE_NOTIFY and associated mce tricks)[0]. On 
>> currently arm64
>> platform, there is no IST interrupt[1] function, so we could not call 
>> memory_failure
>> directly in SEA context. So the way to use TIF notification, is also a good 
>> choice,
>> after all, the same way on x86 platform is verified.
> 
> Thanks, looks like I need to read more of the history of x86's kernel-first
> handling...
> 
> 
> Thanks,
> 
> James
> 
> 
> .
> 

-- 
Thanks,
Xie XiuQi

Re: [PATCH v3 1/3] arm64/ras: support sea error recovery

2017-09-11 Thread Xie XiuQi

Hi James,

Thank you very much for your carefully review.

I first describe the approach of this patchset:

A memory access error on the execution path usually triggers SEA.
According to the existing process, errors occurred in the kernel,
leading to direct panic, if it occurred the user-space, we should
just kill process.

But there is a class of error, in fact, is not necessary to kill
process, you can recover and continue to run the process. Such as
the instruction data corrupted, where the memory page might be
read-only, which is has not been modified, the disk might have the
correct data, so you can directly drop the page, ant reload it when
necessary.

So this patchset is just try to solve such problem: if the error is
consumed in user-space and the error occurs on a clean page, you can
directly drop the memory page without killing process.

This is implemented in memory_failure, which is generic process.

memory_failure
  -> hwpoison_user_mappings

/*
 * Propagate the dirty bit from PTEs to struct page first, because we
 * need this to decide if we should kill or just drop the page.
 * XXX: the dirty test could be racy: set_page_dirty() may not always
 * be called inside page lock (it's recommended but not enforced).
 */
mapping = page_mapping(hpage);
if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
mapping_cap_writeback_dirty(mapping)) {
if (page_mkclean(hpage)) {
SetPageDirty(hpage);
} else {
kill = 0;
ttu |= TTU_IGNORE_HWPOISON;
pr_info("Memory failure: %#lx: corrupted page was 
clean: dropped without side effects\n",
pfn);
}
}

The error reported by SEA should be handled before re-enter the process,
or we must kill the process to prevent error propagation.

memory_failure_queue() is asynchronous, in which, error info was saved
at ghes_proc, but handled in kworker. During this period there is a context
switching, so we can not determine which process would be switch to. So
memory_failure_queue is not suitable for handling the problem.

And memory_failure is not nmi-safe, so it can not be called directly in the
SEA context. So we just handle this error at SEA exit path, and before context
switching.

In FFH mode, physical address can only be obtained by parsing the GHES table.
But we only care about SEA, so the error handling is tied to the type of 
notification.

The TIF flag is checked on a generic path, but it will only be set when SEA 
occurs.
And if we use unlikely optimization, it should have little impact on 
performance.

And the TIF flag approach was used on x86 platform for years, until commit 
d4812e169d
(x86, mce: Get rid of TIF_MCE_NOTIFY and associated mce tricks)[0]. On 
currently arm64
platform, there is no IST interrupt[1] function, so we could not call 
memory_failure
directly in SEA context. So the way to use TIF notification, is also a good 
choice,
after all, the same way on x86 platform is verified.

Any comment is welcome, thanks.

[0] https://patchwork.kernel.org/patch/5571021/
[1] [PATCH v4 0/5] x86: Rework IST interrupts 
https://lkml.org/lkml/2014/11/21/632

On 2017/9/9 2:15, James Morse wrote:
> Hi Xie XiuQi,
> 
> (Sorry a few versions of this went past before I caught up with it)
> 
> On 07/09/17 08:45, Xie XiuQi wrote:
>> With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
>> are consumed. In some cases, if the error address is in a clean page or a
>> read-only page, there is a chance to recover. Such as error occurs in a
>> instruction page, we can reread this page from disk instead of killing
>> process.
> 
>> Because memory_failure() may sleep, we can not call it directly in SEA
>> exception context.
> 
> This is why we have memory_failure_queue() instead, it ... bother. That 
> doesn't
> look nmi-safe. (I thought this ended with an llist, but clearly I was looking 
> at
> the wrong thing).
> 
> It doesn't look like this is a problem for NOTIFY_SEA as it would only 
> interrupt
> itself on the same CPU if the memory-failure code/data were corrupt. (which is
> not a case we can handle). We need to fix this before any of the asynchronous
> NMI-like RAS notifications for arm64 get merged.
> 
> (this is one problem, but I don't think its 'the' problem you are trying to
> solve with this series).
> 
> 
>> So we saved faulting physical address associated with
>> a process in the ghes handler and set __TIF_SEA_NOTIFY.
> 
> A per-notification type TIF flag looks fishy, surely this would affect all
> NMI-like RAS notification methods?
> 
> 
>> When we return
>> from SEA exception context and

Re: [PATCH v3 1/3] arm64/ras: support sea error recovery

2017-09-11 Thread Xie XiuQi

Hi James,

Thank you very much for your carefully review.

I first describe the approach of this patchset:

A memory access error on the execution path usually triggers SEA.
According to the existing process, errors occurred in the kernel,
leading to direct panic, if it occurred the user-space, we should
just kill process.

But there is a class of error, in fact, is not necessary to kill
process, you can recover and continue to run the process. Such as
the instruction data corrupted, where the memory page might be
read-only, which is has not been modified, the disk might have the
correct data, so you can directly drop the page, ant reload it when
necessary.

So this patchset is just try to solve such problem: if the error is
consumed in user-space and the error occurs on a clean page, you can
directly drop the memory page without killing process.

This is implemented in memory_failure, which is generic process.

memory_failure
  -> hwpoison_user_mappings

/*
 * Propagate the dirty bit from PTEs to struct page first, because we
 * need this to decide if we should kill or just drop the page.
 * XXX: the dirty test could be racy: set_page_dirty() may not always
 * be called inside page lock (it's recommended but not enforced).
 */
mapping = page_mapping(hpage);
if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
mapping_cap_writeback_dirty(mapping)) {
if (page_mkclean(hpage)) {
SetPageDirty(hpage);
} else {
kill = 0;
ttu |= TTU_IGNORE_HWPOISON;
pr_info("Memory failure: %#lx: corrupted page was 
clean: dropped without side effects\n",
pfn);
}
}

The error reported by SEA should be handled before re-enter the process,
or we must kill the process to prevent error propagation.

memory_failure_queue() is asynchronous, in which, error info was saved
at ghes_proc, but handled in kworker. During this period there is a context
switching, so we can not determine which process would be switch to. So
memory_failure_queue is not suitable for handling the problem.

And memory_failure is not nmi-safe, so it can not be called directly in the
SEA context. So we just handle this error at SEA exit path, and before context
switching.

In FFH mode, physical address can only be obtained by parsing the GHES table.
But we only care about SEA, so the error handling is tied to the type of 
notification.

The TIF flag is checked on a generic path, but it will only be set when SEA 
occurs.
And if we use unlikely optimization, it should have little impact on 
performance.

And the TIF flag approach was used on x86 platform for years, until commit 
d4812e169d
(x86, mce: Get rid of TIF_MCE_NOTIFY and associated mce tricks)[0]. On 
currently arm64
platform, there is no IST interrupt[1] function, so we could not call 
memory_failure
directly in SEA context. So the way to use TIF notification, is also a good 
choice,
after all, the same way on x86 platform is verified.

Any comment is welcome, thanks.

[0] https://patchwork.kernel.org/patch/5571021/
[1] [PATCH v4 0/5] x86: Rework IST interrupts 
https://lkml.org/lkml/2014/11/21/632

On 2017/9/9 2:15, James Morse wrote:
> Hi Xie XiuQi,
> 
> (Sorry a few versions of this went past before I caught up with it)
> 
> On 07/09/17 08:45, Xie XiuQi wrote:
>> With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
>> are consumed. In some cases, if the error address is in a clean page or a
>> read-only page, there is a chance to recover. Such as error occurs in a
>> instruction page, we can reread this page from disk instead of killing
>> process.
> 
>> Because memory_failure() may sleep, we can not call it directly in SEA
>> exception context.
> 
> This is why we have memory_failure_queue() instead, it ... bother. That 
> doesn't
> look nmi-safe. (I thought this ended with an llist, but clearly I was looking 
> at
> the wrong thing).
> 
> It doesn't look like this is a problem for NOTIFY_SEA as it would only 
> interrupt
> itself on the same CPU if the memory-failure code/data were corrupt. (which is
> not a case we can handle). We need to fix this before any of the asynchronous
> NMI-like RAS notifications for arm64 get merged.
> 
> (this is one problem, but I don't think its 'the' problem you are trying to
> solve with this series).
> 
> 
>> So we saved faulting physical address associated with
>> a process in the ghes handler and set __TIF_SEA_NOTIFY.
> 
> A per-notification type TIF flag looks fishy, surely this would affect all
> NMI-like RAS notification methods?
> 
> 
>> When we return
>> from SEA exception context and

[PATCH v3 3/3] arm64/ras: save error address from memory section for recovery

2017-09-07 Thread Xie XiuQi

In some platform, when SEA triggerred, physical address might be
reported by memory section, so we save it for error recovery later.

Signed-off-by: Xie XiuQi <xiexi...@huawei.com>
---
 arch/arm64/kernel/ras.c | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/arch/arm64/kernel/ras.c b/arch/arm64/kernel/ras.c
index 797722d..b24b449 100644
--- a/arch/arm64/kernel/ras.c
+++ b/arch/arm64/kernel/ras.c
@@ -139,3 +139,34 @@ void arm_process_error(struct ghes *ghes, struct 
cper_sec_proc_arm *err)
if (info_saved)
set_thread_flag(TIF_SEA_NOTIFY);
 }
+
+int ghes_mem_err_callback(struct notifier_block *nb, unsigned long val, void 
*data)
+{
+   bool info_saved = false;
+   struct ghes_mem_err *ghes_mem = (struct ghes_mem_err *)data;
+   struct cper_sec_mem_err *mem_err = ghes_mem->mem_err;
+
+   if ((ghes_mem->notify_type != ACPI_HEST_NOTIFY_SEA) ||
+   (ghes_mem->severity != CPER_SEV_RECOVERABLE))
+   return 0;
+
+   if (mem_err->validation_bits & CPER_MEM_VALID_PA)
+   info_saved = sea_save_info(mem_err->physical_addr);
+
+   if (info_saved)
+   set_thread_flag(TIF_SEA_NOTIFY);
+
+   return 0;
+}
+
+static struct notifier_block ghes_mem_err_nb = {
+   .notifier_call  = ghes_mem_err_callback,
+};
+
+static int arm64_err_recov_init(void)
+{
+   atomic_notifier_chain_register(_mem_err_chain, _mem_err_nb);
+   return 0;
+}
+
+late_initcall(arm64_err_recov_init);
-- 
1.8.3.1

[PATCH v3 3/3] arm64/ras: save error address from memory section for recovery

2017-09-07 Thread Xie XiuQi

In some platform, when SEA triggerred, physical address might be
reported by memory section, so we save it for error recovery later.

Signed-off-by: Xie XiuQi 
---
 arch/arm64/kernel/ras.c | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/arch/arm64/kernel/ras.c b/arch/arm64/kernel/ras.c
index 797722d..b24b449 100644
--- a/arch/arm64/kernel/ras.c
+++ b/arch/arm64/kernel/ras.c
@@ -139,3 +139,34 @@ void arm_process_error(struct ghes *ghes, struct 
cper_sec_proc_arm *err)
if (info_saved)
set_thread_flag(TIF_SEA_NOTIFY);
 }
+
+int ghes_mem_err_callback(struct notifier_block *nb, unsigned long val, void 
*data)
+{
+   bool info_saved = false;
+   struct ghes_mem_err *ghes_mem = (struct ghes_mem_err *)data;
+   struct cper_sec_mem_err *mem_err = ghes_mem->mem_err;
+
+   if ((ghes_mem->notify_type != ACPI_HEST_NOTIFY_SEA) ||
+   (ghes_mem->severity != CPER_SEV_RECOVERABLE))
+   return 0;
+
+   if (mem_err->validation_bits & CPER_MEM_VALID_PA)
+   info_saved = sea_save_info(mem_err->physical_addr);
+
+   if (info_saved)
+   set_thread_flag(TIF_SEA_NOTIFY);
+
+   return 0;
+}
+
+static struct notifier_block ghes_mem_err_nb = {
+   .notifier_call  = ghes_mem_err_callback,
+};
+
+static int arm64_err_recov_init(void)
+{
+   atomic_notifier_chain_register(_mem_err_chain, _mem_err_nb);
+   return 0;
+}
+
+late_initcall(arm64_err_recov_init);
-- 
1.8.3.1

[PATCH v3 1/3] arm64/ras: support sea error recovery

2017-09-07 Thread Xie XiuQi

With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
are consumed. In some cases, if the error address is in a clean page or a
read-only page, there is a chance to recover. Such as error occurs in a
instruction page, we can reread this page from disk instead of killing
process.

Because memory_failure() may sleep, we can not call it directly in SEA
exception context. So we saved faulting physical address associated with
a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return
from SEA exception context and get into do_notify_resume() before the
process running, we could check it and call memory_failure() to do
recovery. It's safe, because we are in process context.

Signed-off-by: Xie XiuQi <xiexi...@huawei.com>
Signed-off-by: Wang Xiongfeng <wangxiongfe...@huawei.com>
---
 arch/arm64/Kconfig   |  11 +++
 arch/arm64/include/asm/ras.h |  36 +
 arch/arm64/include/asm/thread_info.h |   4 +-
 arch/arm64/kernel/Makefile   |   1 +
 arch/arm64/kernel/ras.c  | 141 +++
 arch/arm64/kernel/signal.c   |   8 ++
 arch/arm64/mm/fault.c|  27 +--
 drivers/acpi/apei/ghes.c |   4 +-
 8 files changed, 221 insertions(+), 11 deletions(-)
 create mode 100644 arch/arm64/include/asm/ras.h
 create mode 100644 arch/arm64/kernel/ras.c

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index dfd9086..7d44589 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -640,6 +640,17 @@ config HOTPLUG_CPU
  Say Y here to experiment with turning CPUs off and on.  CPUs
  can be controlled through /sys/devices/system/cpu.
 
+config ARM64_ERR_RECOV
+   bool "Support arm64 RAS error recovery"
+   depends on ACPI_APEI_SEA && MEMORY_FAILURE
+   help
+ With ARM v8.2 RAS Extension, SEA are usually triggered when memory 
errors
+ are consumed. In some cases, if the error address is in a clean page 
or a
+ read-only page, there is a chance to recover. Such as error occurs in 
a
+ instruction page, we can reread this page from disk instead of 
killing process.
+
+ Say Y if unsure.
+
 # Common NUMA Features
 config NUMA
bool "Numa Memory Allocation and Scheduler Support"
diff --git a/arch/arm64/include/asm/ras.h b/arch/arm64/include/asm/ras.h
new file mode 100644
index 000..e174f95
--- /dev/null
+++ b/arch/arm64/include/asm/ras.h
@@ -0,0 +1,36 @@
+/*
+ * ARM64 SEA error recoery support
+ *
+ * Copyright 2017 Huawei Technologies Co., Ltd.
+ *   Author: Xie XiuQi <xiexi...@huawei.com>
+ *   Author: Wang Xiongfeng <wangxiongfe...@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _ASM_RAS_H
+#define _ASM_RAS_H
+
+#include 
+#include 
+#include 
+
+extern void sea_notify_process(void);
+
+#ifdef CONFIG_ARM64_ERR_RECOV
+extern void arm_process_error(struct ghes *ghes, struct cper_sec_proc_arm 
*err);
+#else
+static inline void arm_process_error(struct ghes *ghes, struct 
cper_sec_proc_arm *err)
+{
+   log_arm_hw_error(err);
+}
+#endif /* CONFIG_ARM64_ERR_RECOV */
+
+#endif /*_ASM_RAS_H*/
diff --git a/arch/arm64/include/asm/thread_info.h 
b/arch/arm64/include/asm/thread_info.h
index 46c3b93..4b10131 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -86,6 +86,7 @@ struct thread_info {
 #define TIF_NOTIFY_RESUME  2   /* callback before returning to user */
 #define TIF_FOREIGN_FPSTATE3   /* CPU's FP state is not current's */
 #define TIF_UPROBE 4   /* uprobe breakpoint or singlestep */
+#define TIF_SEA_NOTIFY  5   /* notify to do an error recovery */
 #define TIF_NOHZ   7
 #define TIF_SYSCALL_TRACE  8
 #define TIF_SYSCALL_AUDIT  9
@@ -102,6 +103,7 @@ struct thread_info {
 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
 #define _TIF_FOREIGN_FPSTATE   (1 << TIF_FOREIGN_FPSTATE)
 #define _TIF_NOHZ  (1 << TIF_NOHZ)
+#define _TIF_SEA_NOTIFY (1 << TIF_SEA_NOTIFY)
 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SYSCALL_TRACEPOINT(1 << TIF_SYSCALL_TRACEPOINT)
@@ -111,7 +113,7 @@ struct thread_info {
 
 #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
-

[PATCH v3 1/3] arm64/ras: support sea error recovery

2017-09-07 Thread Xie XiuQi

With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
are consumed. In some cases, if the error address is in a clean page or a
read-only page, there is a chance to recover. Such as error occurs in a
instruction page, we can reread this page from disk instead of killing
process.

Because memory_failure() may sleep, we can not call it directly in SEA
exception context. So we saved faulting physical address associated with
a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return
from SEA exception context and get into do_notify_resume() before the
process running, we could check it and call memory_failure() to do
recovery. It's safe, because we are in process context.

Signed-off-by: Xie XiuQi 
Signed-off-by: Wang Xiongfeng 
---
 arch/arm64/Kconfig   |  11 +++
 arch/arm64/include/asm/ras.h |  36 +
 arch/arm64/include/asm/thread_info.h |   4 +-
 arch/arm64/kernel/Makefile   |   1 +
 arch/arm64/kernel/ras.c  | 141 +++
 arch/arm64/kernel/signal.c   |   8 ++
 arch/arm64/mm/fault.c|  27 +--
 drivers/acpi/apei/ghes.c |   4 +-
 8 files changed, 221 insertions(+), 11 deletions(-)
 create mode 100644 arch/arm64/include/asm/ras.h
 create mode 100644 arch/arm64/kernel/ras.c

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index dfd9086..7d44589 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -640,6 +640,17 @@ config HOTPLUG_CPU
  Say Y here to experiment with turning CPUs off and on.  CPUs
  can be controlled through /sys/devices/system/cpu.
 
+config ARM64_ERR_RECOV
+   bool "Support arm64 RAS error recovery"
+   depends on ACPI_APEI_SEA && MEMORY_FAILURE
+   help
+ With ARM v8.2 RAS Extension, SEA are usually triggered when memory 
errors
+ are consumed. In some cases, if the error address is in a clean page 
or a
+ read-only page, there is a chance to recover. Such as error occurs in 
a
+ instruction page, we can reread this page from disk instead of 
killing process.
+
+ Say Y if unsure.
+
 # Common NUMA Features
 config NUMA
bool "Numa Memory Allocation and Scheduler Support"
diff --git a/arch/arm64/include/asm/ras.h b/arch/arm64/include/asm/ras.h
new file mode 100644
index 000..e174f95
--- /dev/null
+++ b/arch/arm64/include/asm/ras.h
@@ -0,0 +1,36 @@
+/*
+ * ARM64 SEA error recoery support
+ *
+ * Copyright 2017 Huawei Technologies Co., Ltd.
+ *   Author: Xie XiuQi 
+ *   Author: Wang Xiongfeng 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _ASM_RAS_H
+#define _ASM_RAS_H
+
+#include 
+#include 
+#include 
+
+extern void sea_notify_process(void);
+
+#ifdef CONFIG_ARM64_ERR_RECOV
+extern void arm_process_error(struct ghes *ghes, struct cper_sec_proc_arm 
*err);
+#else
+static inline void arm_process_error(struct ghes *ghes, struct 
cper_sec_proc_arm *err)
+{
+   log_arm_hw_error(err);
+}
+#endif /* CONFIG_ARM64_ERR_RECOV */
+
+#endif /*_ASM_RAS_H*/
diff --git a/arch/arm64/include/asm/thread_info.h 
b/arch/arm64/include/asm/thread_info.h
index 46c3b93..4b10131 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -86,6 +86,7 @@ struct thread_info {
 #define TIF_NOTIFY_RESUME  2   /* callback before returning to user */
 #define TIF_FOREIGN_FPSTATE3   /* CPU's FP state is not current's */
 #define TIF_UPROBE 4   /* uprobe breakpoint or singlestep */
+#define TIF_SEA_NOTIFY  5   /* notify to do an error recovery */
 #define TIF_NOHZ   7
 #define TIF_SYSCALL_TRACE  8
 #define TIF_SYSCALL_AUDIT  9
@@ -102,6 +103,7 @@ struct thread_info {
 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
 #define _TIF_FOREIGN_FPSTATE   (1 << TIF_FOREIGN_FPSTATE)
 #define _TIF_NOHZ  (1 << TIF_NOHZ)
+#define _TIF_SEA_NOTIFY (1 << TIF_SEA_NOTIFY)
 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SYSCALL_TRACEPOINT(1 << TIF_SYSCALL_TRACEPOINT)
@@ -111,7 +113,7 @@ struct thread_info {
 
 #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
-_TIF_UPROBE)
+_TIF_UPROBE|_TIF_SEA_NOTIFY)
 
 #

[PATCH v3 2/3] GHES: add a notify chain for process memory section

2017-09-07 Thread Xie XiuQi

Add a notify chain for process memory section, with
which other modules might do error recovery.

Signed-off-by: Xie XiuQi <xiexi...@huawei.com>
---
 drivers/acpi/apei/ghes.c | 10 ++
 include/acpi/ghes.h  |  8 
 2 files changed, 18 insertions(+)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 502335c..5b310b7 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -108,6 +108,9 @@ static inline bool is_hest_type_generic_v2(struct ghes 
*ghes)
 static LIST_HEAD(ghes_hed);
 static DEFINE_MUTEX(ghes_list_mutex);
 
+ATOMIC_NOTIFIER_HEAD(ghes_mem_err_chain);
+EXPORT_SYMBOL(ghes_mem_err_chain);
+
 /*
  * Because the memory area used to transfer hardware error information
  * from BIOS to Linux can be determined only in NMI, IRQ or timer
@@ -479,6 +482,13 @@ static void ghes_do_proc(struct ghes *ghes,
 
if (guid_equal(sec_type, _SEC_PLATFORM_MEM)) {
struct cper_sec_mem_err *mem_err = 
acpi_hest_get_payload(gdata);
+   struct ghes_mem_err mem;
+
+   mem.notify_type = ghes->generic->notify.type;
+   mem.severity = gdata->error_severity;
+   mem.mem_err = mem_err;
+
+   atomic_notifier_call_chain(_mem_err_chain, 0, 
);
 
ghes_edac_report_mem_error(ghes, sev, mem_err);
 
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 9f26e01..df3f6d9 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -115,4 +115,12 @@ static inline void *acpi_hest_get_next(struct 
acpi_hest_generic_data *gdata)
 
 int ghes_notify_sea(void);
 
+struct ghes_mem_err {
+   int notify_type;
+   int severity;
+   struct cper_sec_mem_err *mem_err;
+};
+
+extern struct atomic_notifier_head ghes_mem_err_chain;
+
 #endif /* GHES_H */
-- 
1.8.3.1

[PATCH v3 0/3] arm64/ras: support sea error recovery

2017-09-07 Thread Xie XiuQi

With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
are consumed. In some cases, if the error address is in a clean page or a
read-only page, there is a chance to recover. Such as error occurs in a
instruction page, we can reread this page from disk instead of killing
process.

Because memory_failure() may sleep, we can not call it directly in SEA
exception context. So we saved faulting physical address associated with
a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return
from SEA exception context and get into do_notify_resume() before the
process running, we could check it and call memory_failure() to do
recovery. It's safe, because we are in process context.

In some platform, when SEA triggerred, physical address could be reported
by memory section or by processor section, so we save address at this two
place.

---
v3 - v2:
  - fix patch style issue

v2 - v1:
  - wrap arm_proc_error_check and log_arm_hw_error in a single 
arm_process_error()
  - fix sea_save_info return value issue
  - fix link error if this CONFIG_ARM64_ERR_RECOV is not selected
  - use a notify chain instead of call arch_apei_report_mem_error directly

  https://lkml.org/lkml/2017/9/1/189

Xie XiuQi (3):
  arm64/ras: support sea error recovery
  GHES: add a notify chain for process memory section
  arm64/ras: save error address from memory section for recovery

 arch/arm64/Kconfig   |  11 +++
 arch/arm64/include/asm/ras.h |  36 
 arch/arm64/include/asm/thread_info.h |   4 +-
 arch/arm64/kernel/Makefile   |   1 +
 arch/arm64/kernel/ras.c  | 172 +++
 arch/arm64/kernel/signal.c   |   8 ++
 arch/arm64/mm/fault.c|  27 --
 drivers/acpi/apei/ghes.c |  14 ++-
 include/acpi/ghes.h  |   8 ++
 9 files changed, 270 insertions(+), 11 deletions(-)
 create mode 100644 arch/arm64/include/asm/ras.h
 create mode 100644 arch/arm64/kernel/ras.c

-- 
1.8.3.1

[PATCH v3 2/3] GHES: add a notify chain for process memory section

2017-09-07 Thread Xie XiuQi

Add a notify chain for process memory section, with
which other modules might do error recovery.

Signed-off-by: Xie XiuQi 
---
 drivers/acpi/apei/ghes.c | 10 ++
 include/acpi/ghes.h  |  8 
 2 files changed, 18 insertions(+)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 502335c..5b310b7 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -108,6 +108,9 @@ static inline bool is_hest_type_generic_v2(struct ghes 
*ghes)
 static LIST_HEAD(ghes_hed);
 static DEFINE_MUTEX(ghes_list_mutex);
 
+ATOMIC_NOTIFIER_HEAD(ghes_mem_err_chain);
+EXPORT_SYMBOL(ghes_mem_err_chain);
+
 /*
  * Because the memory area used to transfer hardware error information
  * from BIOS to Linux can be determined only in NMI, IRQ or timer
@@ -479,6 +482,13 @@ static void ghes_do_proc(struct ghes *ghes,
 
if (guid_equal(sec_type, _SEC_PLATFORM_MEM)) {
struct cper_sec_mem_err *mem_err = 
acpi_hest_get_payload(gdata);
+   struct ghes_mem_err mem;
+
+   mem.notify_type = ghes->generic->notify.type;
+   mem.severity = gdata->error_severity;
+   mem.mem_err = mem_err;
+
+   atomic_notifier_call_chain(_mem_err_chain, 0, 
);
 
ghes_edac_report_mem_error(ghes, sev, mem_err);
 
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 9f26e01..df3f6d9 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -115,4 +115,12 @@ static inline void *acpi_hest_get_next(struct 
acpi_hest_generic_data *gdata)
 
 int ghes_notify_sea(void);
 
+struct ghes_mem_err {
+   int notify_type;
+   int severity;
+   struct cper_sec_mem_err *mem_err;
+};
+
+extern struct atomic_notifier_head ghes_mem_err_chain;
+
 #endif /* GHES_H */
-- 
1.8.3.1

[PATCH v3 0/3] arm64/ras: support sea error recovery

2017-09-07 Thread Xie XiuQi

With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
are consumed. In some cases, if the error address is in a clean page or a
read-only page, there is a chance to recover. Such as error occurs in a
instruction page, we can reread this page from disk instead of killing
process.

Because memory_failure() may sleep, we can not call it directly in SEA
exception context. So we saved faulting physical address associated with
a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return
from SEA exception context and get into do_notify_resume() before the
process running, we could check it and call memory_failure() to do
recovery. It's safe, because we are in process context.

In some platform, when SEA triggerred, physical address could be reported
by memory section or by processor section, so we save address at this two
place.

---
v3 - v2:
  - fix patch style issue

v2 - v1:
  - wrap arm_proc_error_check and log_arm_hw_error in a single 
arm_process_error()
  - fix sea_save_info return value issue
  - fix link error if this CONFIG_ARM64_ERR_RECOV is not selected
  - use a notify chain instead of call arch_apei_report_mem_error directly

  https://lkml.org/lkml/2017/9/1/189

Xie XiuQi (3):
  arm64/ras: support sea error recovery
  GHES: add a notify chain for process memory section
  arm64/ras: save error address from memory section for recovery

 arch/arm64/Kconfig   |  11 +++
 arch/arm64/include/asm/ras.h |  36 
 arch/arm64/include/asm/thread_info.h |   4 +-
 arch/arm64/kernel/Makefile   |   1 +
 arch/arm64/kernel/ras.c  | 172 +++
 arch/arm64/kernel/signal.c   |   8 ++
 arch/arm64/mm/fault.c|  27 --
 drivers/acpi/apei/ghes.c |  14 ++-
 include/acpi/ghes.h  |   8 ++
 9 files changed, 270 insertions(+), 11 deletions(-)
 create mode 100644 arch/arm64/include/asm/ras.h
 create mode 100644 arch/arm64/kernel/ras.c

-- 
1.8.3.1

Re: [PATCH v2 1/3] arm64/ras: support sea error recovery

2017-09-06 Thread Xie XiuQi

Hi Borislav,

On 2017/9/6 18:12, Borislav Petkov wrote:
> On Tue, Sep 05, 2017 at 07:06:04PM +0800, Xie XiuQi wrote:
>> With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
>> are consumed. In some cases, if the error address is in a clean page or a
>> read-only page, there is a chance to recover. Such as error occurs in a
>> instruction page, we can reread this page from disk instead of killing 
>> process.
>>
>> Because memory_failure() may sleep, we can not call it directly in SEA 
>> exception
>> context. So we saved faulting physical address associated with a process in 
>> the
>> ghes handler and set __TIF_SEA_NOTIFY. When we return from SEA exception 
>> context
>> and get into do_notify_resume() before the process running, we could check it
>> and call memory_failure() to do recovery. It's safe, because we are in 
>> process
>> context.
>>
>> Signed-off-by: Xie XiuQi <xiexi...@huawei.com>
>> Signed-off-by: Wang Xiongfeng <wangxiongfe...@huawei.com>
>> ---
>>  arch/arm64/Kconfig   |  11 +++
>>  arch/arm64/include/asm/ras.h |  36 +
>>  arch/arm64/include/asm/thread_info.h |   4 +-
>>  arch/arm64/kernel/Makefile   |   1 +
>>  arch/arm64/kernel/ras.c  | 143 
>> +++
>>  arch/arm64/kernel/signal.c   |   8 ++
>>  arch/arm64/mm/fault.c|  27 +--
>>  drivers/acpi/apei/ghes.c |   4 +-
>>  8 files changed, 223 insertions(+), 11 deletions(-)
>>  create mode 100644 arch/arm64/include/asm/ras.h
>>  create mode 100644 arch/arm64/kernel/ras.c
> 
> Please integrate scripts/checkpatch.pl into your patch creation workflow
> and run all patches through it before submitting:

Sorry for my mistake. I'll fix it, thanks.

> 
> ERROR: code indent should use tabs where possible
> #200: FILE: arch/arm64/kernel/ras.c:42:
> +atomic_tinuse;$
> 
> WARNING: please, no spaces at the start of a line
> #200: FILE: arch/arm64/kernel/ras.c:42:
> +atomic_tinuse;$
> 
> ERROR: code indent should use tabs where possible
> #201: FILE: arch/arm64/kernel/ras.c:43:
> +struct task_struct  *t;$
> 
> WARNING: please, no spaces at the start of a line
> #201: FILE: arch/arm64/kernel/ras.c:43:
> +struct task_struct  *t;$
> 
> ERROR: code indent should use tabs where possible
> #202: FILE: arch/arm64/kernel/ras.c:44:
> +__u64   paddr;$
> 
> WARNING: please, no spaces at the start of a line
> #202: FILE: arch/arm64/kernel/ras.c:44:
> +__u64   paddr;$
> 
> ERROR: code indent should use tabs where possible
> #207: FILE: arch/arm64/kernel/ras.c:49:
> +struct sea_info *si;$
> 
> WARNING: please, no spaces at the start of a line
> #207: FILE: arch/arm64/kernel/ras.c:49:
> +struct sea_info *si;$
> 
> ERROR: code indent should use tabs where possible
> #209: FILE: arch/arm64/kernel/ras.c:51:
> +for (si = sea_info; si < _info[SEA_INFO_MAX]; si++) {$
> 
> WARNING: please, no spaces at the start of a line
> #209: FILE: arch/arm64/kernel/ras.c:51:
> +for (si = sea_info; si < _info[SEA_INFO_MAX]; si++) {$
> 
> ERROR: code indent should use tabs where possible
> #210: FILE: arch/arm64/kernel/ras.c:52:
> +if (atomic_cmpxchg(>inuse, 0, 1) == 0) {$
> 
> WARNING: please, no spaces at the start of a line
> #210: FILE: arch/arm64/kernel/ras.c:52:
> +if (atomic_cmpxchg(>inuse, 0, 1) == 0) {$
> 
> ERROR: code indent should use tabs where possible
> #211: FILE: arch/arm64/kernel/ras.c:53:
> +si->t = current;$
> 
> WARNING: please, no spaces at the start of a line
> #211: FILE: arch/arm64/kernel/ras.c:53:
> +si->t = current;$
> 
> ERROR: code indent should use tabs where possible
> #212: FILE: arch/arm64/kernel/ras.c:54:
> +si->paddr = addr;$
> 
> WARNING: please, no spaces at the start of a line
> #212: FILE: arch/arm64/kernel/ras.c:54:
> +si->paddr = addr;$
> 
> ERROR: code indent should use tabs where possible
> #213: FILE: arch/arm64/kernel/ras.c:55:
> +return true;$
> 
> WARNING: please, no spaces at the start of a line
> #213: FILE: arch/arm64/kernel/ras.c:55:
> +return true;$
> 
> ERROR: code indent should use tabs where possible
> #214: FILE: arch/arm64/kernel/ras.c:56:
> +}$
> 
> WARNING: please, no spaces at the st

Re: [PATCH v2 1/3] arm64/ras: support sea error recovery

2017-09-06 Thread Xie XiuQi

Hi Borislav,

On 2017/9/6 18:12, Borislav Petkov wrote:
> On Tue, Sep 05, 2017 at 07:06:04PM +0800, Xie XiuQi wrote:
>> With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
>> are consumed. In some cases, if the error address is in a clean page or a
>> read-only page, there is a chance to recover. Such as error occurs in a
>> instruction page, we can reread this page from disk instead of killing 
>> process.
>>
>> Because memory_failure() may sleep, we can not call it directly in SEA 
>> exception
>> context. So we saved faulting physical address associated with a process in 
>> the
>> ghes handler and set __TIF_SEA_NOTIFY. When we return from SEA exception 
>> context
>> and get into do_notify_resume() before the process running, we could check it
>> and call memory_failure() to do recovery. It's safe, because we are in 
>> process
>> context.
>>
>> Signed-off-by: Xie XiuQi 
>> Signed-off-by: Wang Xiongfeng 
>> ---
>>  arch/arm64/Kconfig   |  11 +++
>>  arch/arm64/include/asm/ras.h |  36 +
>>  arch/arm64/include/asm/thread_info.h |   4 +-
>>  arch/arm64/kernel/Makefile   |   1 +
>>  arch/arm64/kernel/ras.c  | 143 
>> +++
>>  arch/arm64/kernel/signal.c   |   8 ++
>>  arch/arm64/mm/fault.c|  27 +--
>>  drivers/acpi/apei/ghes.c |   4 +-
>>  8 files changed, 223 insertions(+), 11 deletions(-)
>>  create mode 100644 arch/arm64/include/asm/ras.h
>>  create mode 100644 arch/arm64/kernel/ras.c
> 
> Please integrate scripts/checkpatch.pl into your patch creation workflow
> and run all patches through it before submitting:

Sorry for my mistake. I'll fix it, thanks.

> 
> ERROR: code indent should use tabs where possible
> #200: FILE: arch/arm64/kernel/ras.c:42:
> +atomic_tinuse;$
> 
> WARNING: please, no spaces at the start of a line
> #200: FILE: arch/arm64/kernel/ras.c:42:
> +atomic_tinuse;$
> 
> ERROR: code indent should use tabs where possible
> #201: FILE: arch/arm64/kernel/ras.c:43:
> +struct task_struct  *t;$
> 
> WARNING: please, no spaces at the start of a line
> #201: FILE: arch/arm64/kernel/ras.c:43:
> +struct task_struct  *t;$
> 
> ERROR: code indent should use tabs where possible
> #202: FILE: arch/arm64/kernel/ras.c:44:
> +__u64   paddr;$
> 
> WARNING: please, no spaces at the start of a line
> #202: FILE: arch/arm64/kernel/ras.c:44:
> +__u64   paddr;$
> 
> ERROR: code indent should use tabs where possible
> #207: FILE: arch/arm64/kernel/ras.c:49:
> +struct sea_info *si;$
> 
> WARNING: please, no spaces at the start of a line
> #207: FILE: arch/arm64/kernel/ras.c:49:
> +struct sea_info *si;$
> 
> ERROR: code indent should use tabs where possible
> #209: FILE: arch/arm64/kernel/ras.c:51:
> +for (si = sea_info; si < _info[SEA_INFO_MAX]; si++) {$
> 
> WARNING: please, no spaces at the start of a line
> #209: FILE: arch/arm64/kernel/ras.c:51:
> +for (si = sea_info; si < _info[SEA_INFO_MAX]; si++) {$
> 
> ERROR: code indent should use tabs where possible
> #210: FILE: arch/arm64/kernel/ras.c:52:
> +if (atomic_cmpxchg(>inuse, 0, 1) == 0) {$
> 
> WARNING: please, no spaces at the start of a line
> #210: FILE: arch/arm64/kernel/ras.c:52:
> +if (atomic_cmpxchg(>inuse, 0, 1) == 0) {$
> 
> ERROR: code indent should use tabs where possible
> #211: FILE: arch/arm64/kernel/ras.c:53:
> +si->t = current;$
> 
> WARNING: please, no spaces at the start of a line
> #211: FILE: arch/arm64/kernel/ras.c:53:
> +si->t = current;$
> 
> ERROR: code indent should use tabs where possible
> #212: FILE: arch/arm64/kernel/ras.c:54:
> +si->paddr = addr;$
> 
> WARNING: please, no spaces at the start of a line
> #212: FILE: arch/arm64/kernel/ras.c:54:
> +si->paddr = addr;$
> 
> ERROR: code indent should use tabs where possible
> #213: FILE: arch/arm64/kernel/ras.c:55:
> +return true;$
> 
> WARNING: please, no spaces at the start of a line
> #213: FILE: arch/arm64/kernel/ras.c:55:
> +return true;$
> 
> ERROR: code indent should use tabs where possible
> #214: FILE: arch/arm64/kernel/ras.c:56:
> +}$
> 
> WARNING: please, no spaces at the start of a line
> #214: FILE: arch/arm64/kernel/ras.c:56:
>

[PATCH v2 1/3] arm64/ras: support sea error recovery

2017-09-05 Thread Xie XiuQi

With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
are consumed. In some cases, if the error address is in a clean page or a
read-only page, there is a chance to recover. Such as error occurs in a
instruction page, we can reread this page from disk instead of killing process.

Because memory_failure() may sleep, we can not call it directly in SEA exception
context. So we saved faulting physical address associated with a process in the
ghes handler and set __TIF_SEA_NOTIFY. When we return from SEA exception context
and get into do_notify_resume() before the process running, we could check it
and call memory_failure() to do recovery. It's safe, because we are in process
context.

Signed-off-by: Xie XiuQi <xiexi...@huawei.com>
Signed-off-by: Wang Xiongfeng <wangxiongfe...@huawei.com>
---
 arch/arm64/Kconfig   |  11 +++
 arch/arm64/include/asm/ras.h |  36 +
 arch/arm64/include/asm/thread_info.h |   4 +-
 arch/arm64/kernel/Makefile   |   1 +
 arch/arm64/kernel/ras.c  | 143 +++
 arch/arm64/kernel/signal.c   |   8 ++
 arch/arm64/mm/fault.c|  27 +--
 drivers/acpi/apei/ghes.c |   4 +-
 8 files changed, 223 insertions(+), 11 deletions(-)
 create mode 100644 arch/arm64/include/asm/ras.h
 create mode 100644 arch/arm64/kernel/ras.c

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index dfd9086..7d44589 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -640,6 +640,17 @@ config HOTPLUG_CPU
  Say Y here to experiment with turning CPUs off and on.  CPUs
  can be controlled through /sys/devices/system/cpu.
 
+config ARM64_ERR_RECOV
+   bool "Support arm64 RAS error recovery"
+   depends on ACPI_APEI_SEA && MEMORY_FAILURE
+   help
+ With ARM v8.2 RAS Extension, SEA are usually triggered when memory 
errors
+ are consumed. In some cases, if the error address is in a clean page 
or a
+ read-only page, there is a chance to recover. Such as error occurs in 
a
+ instruction page, we can reread this page from disk instead of 
killing process.
+
+ Say Y if unsure.
+
 # Common NUMA Features
 config NUMA
bool "Numa Memory Allocation and Scheduler Support"
diff --git a/arch/arm64/include/asm/ras.h b/arch/arm64/include/asm/ras.h
new file mode 100644
index 000..e174f95
--- /dev/null
+++ b/arch/arm64/include/asm/ras.h
@@ -0,0 +1,36 @@
+/*
+ * ARM64 SEA error recoery support
+ *
+ * Copyright 2017 Huawei Technologies Co., Ltd.
+ *   Author: Xie XiuQi <xiexi...@huawei.com>
+ *   Author: Wang Xiongfeng <wangxiongfe...@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _ASM_RAS_H
+#define _ASM_RAS_H
+
+#include 
+#include 
+#include 
+
+extern void sea_notify_process(void);
+
+#ifdef CONFIG_ARM64_ERR_RECOV
+extern void arm_process_error(struct ghes *ghes, struct cper_sec_proc_arm 
*err);
+#else
+static inline void arm_process_error(struct ghes *ghes, struct 
cper_sec_proc_arm *err)
+{
+   log_arm_hw_error(err);
+}
+#endif /* CONFIG_ARM64_ERR_RECOV */
+
+#endif /*_ASM_RAS_H*/
diff --git a/arch/arm64/include/asm/thread_info.h 
b/arch/arm64/include/asm/thread_info.h
index 46c3b93..4b10131 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -86,6 +86,7 @@ struct thread_info {
 #define TIF_NOTIFY_RESUME  2   /* callback before returning to user */
 #define TIF_FOREIGN_FPSTATE3   /* CPU's FP state is not current's */
 #define TIF_UPROBE 4   /* uprobe breakpoint or singlestep */
+#define TIF_SEA_NOTIFY  5   /* notify to do an error recovery */
 #define TIF_NOHZ   7
 #define TIF_SYSCALL_TRACE  8
 #define TIF_SYSCALL_AUDIT  9
@@ -102,6 +103,7 @@ struct thread_info {
 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
 #define _TIF_FOREIGN_FPSTATE   (1 << TIF_FOREIGN_FPSTATE)
 #define _TIF_NOHZ  (1 << TIF_NOHZ)
+#define _TIF_SEA_NOTIFY (1 << TIF_SEA_NOTIFY)
 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SYSCALL_TRACEPOINT(1 << TIF_SYSCALL_TRACEPOINT)
@@ -111,7 +113,7 @@ struct thread_info {
 
 #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
-

[PATCH v2 1/3] arm64/ras: support sea error recovery

2017-09-05 Thread Xie XiuQi

With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
are consumed. In some cases, if the error address is in a clean page or a
read-only page, there is a chance to recover. Such as error occurs in a
instruction page, we can reread this page from disk instead of killing process.

Because memory_failure() may sleep, we can not call it directly in SEA exception
context. So we saved faulting physical address associated with a process in the
ghes handler and set __TIF_SEA_NOTIFY. When we return from SEA exception context
and get into do_notify_resume() before the process running, we could check it
and call memory_failure() to do recovery. It's safe, because we are in process
context.

Signed-off-by: Xie XiuQi 
Signed-off-by: Wang Xiongfeng 
---
 arch/arm64/Kconfig   |  11 +++
 arch/arm64/include/asm/ras.h |  36 +
 arch/arm64/include/asm/thread_info.h |   4 +-
 arch/arm64/kernel/Makefile   |   1 +
 arch/arm64/kernel/ras.c  | 143 +++
 arch/arm64/kernel/signal.c   |   8 ++
 arch/arm64/mm/fault.c|  27 +--
 drivers/acpi/apei/ghes.c |   4 +-
 8 files changed, 223 insertions(+), 11 deletions(-)
 create mode 100644 arch/arm64/include/asm/ras.h
 create mode 100644 arch/arm64/kernel/ras.c

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index dfd9086..7d44589 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -640,6 +640,17 @@ config HOTPLUG_CPU
  Say Y here to experiment with turning CPUs off and on.  CPUs
  can be controlled through /sys/devices/system/cpu.
 
+config ARM64_ERR_RECOV
+   bool "Support arm64 RAS error recovery"
+   depends on ACPI_APEI_SEA && MEMORY_FAILURE
+   help
+ With ARM v8.2 RAS Extension, SEA are usually triggered when memory 
errors
+ are consumed. In some cases, if the error address is in a clean page 
or a
+ read-only page, there is a chance to recover. Such as error occurs in 
a
+ instruction page, we can reread this page from disk instead of 
killing process.
+
+ Say Y if unsure.
+
 # Common NUMA Features
 config NUMA
bool "Numa Memory Allocation and Scheduler Support"
diff --git a/arch/arm64/include/asm/ras.h b/arch/arm64/include/asm/ras.h
new file mode 100644
index 000..e174f95
--- /dev/null
+++ b/arch/arm64/include/asm/ras.h
@@ -0,0 +1,36 @@
+/*
+ * ARM64 SEA error recoery support
+ *
+ * Copyright 2017 Huawei Technologies Co., Ltd.
+ *   Author: Xie XiuQi 
+ *   Author: Wang Xiongfeng 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _ASM_RAS_H
+#define _ASM_RAS_H
+
+#include 
+#include 
+#include 
+
+extern void sea_notify_process(void);
+
+#ifdef CONFIG_ARM64_ERR_RECOV
+extern void arm_process_error(struct ghes *ghes, struct cper_sec_proc_arm 
*err);
+#else
+static inline void arm_process_error(struct ghes *ghes, struct 
cper_sec_proc_arm *err)
+{
+   log_arm_hw_error(err);
+}
+#endif /* CONFIG_ARM64_ERR_RECOV */
+
+#endif /*_ASM_RAS_H*/
diff --git a/arch/arm64/include/asm/thread_info.h 
b/arch/arm64/include/asm/thread_info.h
index 46c3b93..4b10131 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -86,6 +86,7 @@ struct thread_info {
 #define TIF_NOTIFY_RESUME  2   /* callback before returning to user */
 #define TIF_FOREIGN_FPSTATE3   /* CPU's FP state is not current's */
 #define TIF_UPROBE 4   /* uprobe breakpoint or singlestep */
+#define TIF_SEA_NOTIFY  5   /* notify to do an error recovery */
 #define TIF_NOHZ   7
 #define TIF_SYSCALL_TRACE  8
 #define TIF_SYSCALL_AUDIT  9
@@ -102,6 +103,7 @@ struct thread_info {
 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
 #define _TIF_FOREIGN_FPSTATE   (1 << TIF_FOREIGN_FPSTATE)
 #define _TIF_NOHZ  (1 << TIF_NOHZ)
+#define _TIF_SEA_NOTIFY (1 << TIF_SEA_NOTIFY)
 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SYSCALL_TRACEPOINT(1 << TIF_SYSCALL_TRACEPOINT)
@@ -111,7 +113,7 @@ struct thread_info {
 
 #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
-_TIF_UPROBE)
+_TIF_UPROBE|_TIF_SEA_NOTIFY)
 
 #

[PATCH v2 2/3] GHES: add a notify chain for process memory section

2017-09-05 Thread Xie XiuQi

Add a notify chain for process memory section, with
which other modules might do error recovery.

Signed-off-by: Xie XiuQi <xiexi...@huawei.com>
Tested-by: Wang Xiongfeng <wangxiongfe...@huawei.com>
---
 drivers/acpi/apei/ghes.c | 10 ++
 include/acpi/ghes.h  |  8 
 2 files changed, 18 insertions(+)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 502335c..5b310b7 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -108,6 +108,9 @@ static inline bool is_hest_type_generic_v2(struct ghes 
*ghes)
 static LIST_HEAD(ghes_hed);
 static DEFINE_MUTEX(ghes_list_mutex);
 
+ATOMIC_NOTIFIER_HEAD(ghes_mem_err_chain);
+EXPORT_SYMBOL(ghes_mem_err_chain);
+
 /*
  * Because the memory area used to transfer hardware error information
  * from BIOS to Linux can be determined only in NMI, IRQ or timer
@@ -479,6 +482,13 @@ static void ghes_do_proc(struct ghes *ghes,
 
if (guid_equal(sec_type, _SEC_PLATFORM_MEM)) {
struct cper_sec_mem_err *mem_err = 
acpi_hest_get_payload(gdata);
+   struct ghes_mem_err mem;
+
+   mem.notify_type = ghes->generic->notify.type;
+   mem.severity = gdata->error_severity;
+   mem.mem_err = mem_err;
+
+   atomic_notifier_call_chain(_mem_err_chain, 0, 
);
 
ghes_edac_report_mem_error(ghes, sev, mem_err);
 
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 9f26e01..e26b783 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -115,4 +115,12 @@ static inline void *acpi_hest_get_next(struct 
acpi_hest_generic_data *gdata)
 
 int ghes_notify_sea(void);
 
+struct ghes_mem_err {
+   int notify_type;
+   int severity;
+   struct cper_sec_mem_err *mem_err;   
+};
+
+extern struct atomic_notifier_head ghes_mem_err_chain;
+
 #endif /* GHES_H */
-- 
1.8.3.1

[PATCH v2 3/3] arm64/ras: save error address from memory section for recovery

2017-09-05 Thread Xie XiuQi

In some platform, when SEA triggerred, physical address might be
reported by memory section, so we save it for error recovery later.

Signed-off-by: Xie XiuQi <xiexi...@huawei.com>
Tested-by: Wang Xiongfeng <wangxiongfe...@huawei.com>
---
 arch/arm64/kernel/ras.c | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/arch/arm64/kernel/ras.c b/arch/arm64/kernel/ras.c
index 5710b2e..8a44c01 100644
--- a/arch/arm64/kernel/ras.c
+++ b/arch/arm64/kernel/ras.c
@@ -141,3 +141,34 @@ void arm_process_error(struct ghes *ghes, struct 
cper_sec_proc_arm *err)
set_thread_flag(TIF_SEA_NOTIFY);
 }
 
+int ghes_mem_err_callback(struct notifier_block *nb, unsigned long val, void 
*data)
+{
+   bool info_saved = false;
+   struct ghes_mem_err *ghes_mem = (struct ghes_mem_err *)data;
+   struct cper_sec_mem_err *mem_err = ghes_mem->mem_err;
+
+   if ((ghes_mem->notify_type != ACPI_HEST_NOTIFY_SEA) ||
+   (ghes_mem->severity != CPER_SEV_RECOVERABLE))
+   return 0;
+
+   if (mem_err->validation_bits & CPER_MEM_VALID_PA) {
+   info_saved = sea_save_info(mem_err->physical_addr);
+   }
+
+   if (info_saved)
+   set_thread_flag(TIF_SEA_NOTIFY);
+
+   return 0;
+}
+
+static struct notifier_block ghes_mem_err_nb = {
+   .notifier_call  = ghes_mem_err_callback,
+};
+
+static int arm64_err_recov_init(void)
+{
+   atomic_notifier_chain_register(_mem_err_chain, _mem_err_nb);
+   return 0;
+}
+
+late_initcall(arm64_err_recov_init);
-- 
1.8.3.1

[PATCH v2 2/3] GHES: add a notify chain for process memory section

2017-09-05 Thread Xie XiuQi

Add a notify chain for process memory section, with
which other modules might do error recovery.

Signed-off-by: Xie XiuQi 
Tested-by: Wang Xiongfeng 
---
 drivers/acpi/apei/ghes.c | 10 ++
 include/acpi/ghes.h  |  8 
 2 files changed, 18 insertions(+)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 502335c..5b310b7 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -108,6 +108,9 @@ static inline bool is_hest_type_generic_v2(struct ghes 
*ghes)
 static LIST_HEAD(ghes_hed);
 static DEFINE_MUTEX(ghes_list_mutex);
 
+ATOMIC_NOTIFIER_HEAD(ghes_mem_err_chain);
+EXPORT_SYMBOL(ghes_mem_err_chain);
+
 /*
  * Because the memory area used to transfer hardware error information
  * from BIOS to Linux can be determined only in NMI, IRQ or timer
@@ -479,6 +482,13 @@ static void ghes_do_proc(struct ghes *ghes,
 
if (guid_equal(sec_type, _SEC_PLATFORM_MEM)) {
struct cper_sec_mem_err *mem_err = 
acpi_hest_get_payload(gdata);
+   struct ghes_mem_err mem;
+
+   mem.notify_type = ghes->generic->notify.type;
+   mem.severity = gdata->error_severity;
+   mem.mem_err = mem_err;
+
+   atomic_notifier_call_chain(_mem_err_chain, 0, 
);
 
ghes_edac_report_mem_error(ghes, sev, mem_err);
 
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 9f26e01..e26b783 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -115,4 +115,12 @@ static inline void *acpi_hest_get_next(struct 
acpi_hest_generic_data *gdata)
 
 int ghes_notify_sea(void);
 
+struct ghes_mem_err {
+   int notify_type;
+   int severity;
+   struct cper_sec_mem_err *mem_err;   
+};
+
+extern struct atomic_notifier_head ghes_mem_err_chain;
+
 #endif /* GHES_H */
-- 
1.8.3.1

[PATCH v2 3/3] arm64/ras: save error address from memory section for recovery

2017-09-05 Thread Xie XiuQi

In some platform, when SEA triggerred, physical address might be
reported by memory section, so we save it for error recovery later.

Signed-off-by: Xie XiuQi 
Tested-by: Wang Xiongfeng 
---
 arch/arm64/kernel/ras.c | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/arch/arm64/kernel/ras.c b/arch/arm64/kernel/ras.c
index 5710b2e..8a44c01 100644
--- a/arch/arm64/kernel/ras.c
+++ b/arch/arm64/kernel/ras.c
@@ -141,3 +141,34 @@ void arm_process_error(struct ghes *ghes, struct 
cper_sec_proc_arm *err)
set_thread_flag(TIF_SEA_NOTIFY);
 }
 
+int ghes_mem_err_callback(struct notifier_block *nb, unsigned long val, void 
*data)
+{
+   bool info_saved = false;
+   struct ghes_mem_err *ghes_mem = (struct ghes_mem_err *)data;
+   struct cper_sec_mem_err *mem_err = ghes_mem->mem_err;
+
+   if ((ghes_mem->notify_type != ACPI_HEST_NOTIFY_SEA) ||
+   (ghes_mem->severity != CPER_SEV_RECOVERABLE))
+   return 0;
+
+   if (mem_err->validation_bits & CPER_MEM_VALID_PA) {
+   info_saved = sea_save_info(mem_err->physical_addr);
+   }
+
+   if (info_saved)
+   set_thread_flag(TIF_SEA_NOTIFY);
+
+   return 0;
+}
+
+static struct notifier_block ghes_mem_err_nb = {
+   .notifier_call  = ghes_mem_err_callback,
+};
+
+static int arm64_err_recov_init(void)
+{
+   atomic_notifier_chain_register(_mem_err_chain, _mem_err_nb);
+   return 0;
+}
+
+late_initcall(arm64_err_recov_init);
-- 
1.8.3.1

1 2 3 4 5 >

1 - 100 of 472 matches

Mail list logo