On 4/6/21 5:15 AM, Peter Zijlstra wrote:
On Mon, Apr 05, 2021 at 07:42:03PM -0400, Waiman Long wrote:
The handling of a sysrq key can be activated by echoing the key to
/proc/sysrq-trigger or via the magic key sequence typed into a terminal
that is connected to the system in some way (serial, USB or other
means). In the former case, the handling is done in a user context. In
the latter case, it is likely to be in an interrupt context. If the
lock is then held for too long with interrupts disabled, the system can
hit a hard-lockup (watchdog) panic, with a stack trace like:
[ 7809.796281]  </NMI>
[ 7809.796282]  _raw_spin_lock_irqsave+0x32/0x40
[ 7809.796283]  print_cpu+0x261/0x7c0
[ 7809.796283]  sysrq_sched_debug_show+0x34/0x50
[ 7809.796284]  sysrq_handle_showstate+0xc/0x20
[ 7809.796284]  __handle_sysrq.cold.11+0x48/0xfb
[ 7809.796285]  write_sysrq_trigger+0x2b/0x30
[ 7809.796285]  proc_reg_write+0x39/0x60
[ 7809.796286]  vfs_write+0xa5/0x1a0
[ 7809.796286]  ksys_write+0x4f/0xb0
[ 7809.796287]  do_syscall_64+0x5b/0x1a0
[ 7809.796287]  entry_SYSCALL_64_after_hwframe+0x65/0xca
[ 7809.796288] RIP: 0033:0x7fabe4ceb648

The purpose of sched_debug_lock is to serialize the use of the global
group_path[] buffer in print_cpu(). The rest of the printk() calls
don't need serialization from sched_debug_lock.
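[For reference, the shared buffer being protected looks roughly like this
before the patch; this is a simplified sketch of kernel/sched/debug.c, not
code quoted in this thread:]

static DEFINE_SPINLOCK(sched_debug_lock);
static char group_path[PATH_MAX];

/*
 * Every print_cpu() caller funnels through here and shares the one
 * global group_path[] scratch buffer, hence the serialization.
 */
static char *task_group_path(struct task_group *tg)
{
	if (autogroup_path(tg, group_path, PATH_MAX))
		return group_path;

	cgroup_path(tg->css.cgroup, group_path, PATH_MAX);

	return group_path;
}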
The print_cpu() function has two callers - sched_debug_show() and
sysrq_sched_debug_show().
So what idiot is doing sysrq and that proc file at the same time? Why is
it a problem now?
We got a customer bug report about a watchdog panic because a process somehow stayed within sched_debug_lock for too long. I don't know exactly what the customer was doing, but we can reproduce the panic just by running two parallel "echo t > /proc/sysrq-trigger" commands. This happens on certain selected systems; I was using some Dell systems for my testing.

Of course, it is not sensible to have more than one sysrq happening at the same time. However, a parallel thread accessing /proc/sched_debug is certainly possible, and that invokes a similar code path.

@@ -470,16 +468,49 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 #endif
 
 #ifdef CONFIG_CGROUP_SCHED
+static DEFINE_SPINLOCK(sched_debug_lock);
 static char group_path[PATH_MAX];
+static enum {
+       TOKEN_NONE,
+       TOKEN_ACQUIRED,
+       TOKEN_NA        /* Not applicable */
+} console_token = TOKEN_ACQUIRED;
+/*
+ * All the print_cpu() callers from sched_debug_show() will be allowed
+ * to contend for sched_debug_lock and use group_path[] as their SEQ_printf()
+ * calls will be much faster. However only one print_cpu() caller from
+ * sysrq_sched_debug_show() which outputs to the console will be allowed
+ * to use group_path[]. Another parallel console writer will have to use
+ * a shorter stack buffer instead. Since the console output will be garbled
+ * anyway, truncation of some cgroup paths shouldn't be a big issue.
+ */
+#define SEQ_printf_task_group_path(m, tg, fmt...)                      \
+{                                                                      \
+       unsigned long flags;                                            \
+       int token = m ? TOKEN_NA                                        \
+                     : xchg_acquire(&console_token, TOKEN_NONE);       \
+                                                                       \
+       if (token == TOKEN_NONE) {                                      \
+               char buf[128];                                          \
+               task_group_path(tg, buf, sizeof(buf));                  \
+               SEQ_printf(m, fmt, buf);                                \
+       } else {                                                        \
+               spin_lock_irqsave(&sched_debug_lock, flags);            \
+               task_group_path(tg, group_path, sizeof(group_path));    \
+               SEQ_printf(m, fmt, group_path);                         \
+               spin_unlock_irqrestore(&sched_debug_lock, flags);       \
+               if (token == TOKEN_ACQUIRED)                            \
+                       smp_store_release(&console_token, token);       \
+       }                                                               \
+}
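[For reference, the rest of the patch (not quoted here) converts call
sites along these lines; the exact format strings are illustrative:]

	/* e.g. in print_task(): */
	SEQ_printf_task_group_path(m, task_group(p), " %s");

	/* e.g. in print_cfs_rq(): extra fmt arguments precede the path */
	SEQ_printf_task_group_path(m, cfs_rq->tg, "cfs_rq[%d]:%s\n", cpu);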
This is disgusting... you have an open-coded test-and-set lock like
thing *AND* a spinlock, what gives?


What's wrong with something simple like this?

---
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4b49cc2af5c4..2ac2977f3b96 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -8,8 +8,6 @@
  */
 #include "sched.h"
 
-static DEFINE_SPINLOCK(sched_debug_lock);
-
 /*
  * This allows printing both to /proc/sched_debug and
  * to the console
@@ -470,6 +468,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 #endif
 
 #ifdef CONFIG_CGROUP_SCHED
+static DEFINE_SPINLOCK(group_path_lock);
 static char group_path[PATH_MAX];
 
 static char *task_group_path(struct task_group *tg)
@@ -481,6 +480,22 @@ static char *task_group_path(struct task_group *tg)
 	return group_path;
 }
+
+#define SEQ_printf_task_group_path(m, tg)                              \
+do {                                                                   \
+       if (spin_trylock(&group_path_lock)) {                           \
+               task_group_path(tg, group_path, sizeof(group_path));    \
+               SEQ_printf(m, "%s", group_path);                        \
+               spin_unlock(&group_path_lock);                          \
+       } else {                                                        \
+               SEQ_printf(m, "looser!");                               \
+       }                                                               \
+} while (0)
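[The trade-off in this version is that spin_trylock() never blocks: a
contending caller simply prints a placeholder instead of the cgroup path,
so nobody can get stuck behind a slow console. Since this macro takes no
format string, a caller would print around it; a hypothetical sketch:]

	SEQ_printf(m, "cfs_rq[%d]:", cpu);
	SEQ_printf_task_group_path(m, cfs_rq->tg);
	SEQ_printf(m, "\n");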

I have no problem with using this as a possible solution, as long as others agree. Of course, I won't use the term "looser"; I will be more polite. :-)

The current patch allows all /proc/sched_debug users and one sysrq user to use the full group_path[] buffer. I can certainly change it to allow one user to use the full-size path, with the rest running the risk of path truncation.
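[A sketch of what that compromise could look like, loosely combining the
two patches above; it assumes task_group_path() has been changed to take
a buffer and length as in the quoted patch, and the 128-byte fallback
size is illustrative:]

#define SEQ_printf_task_group_path(m, tg, fmt...)                      \
{                                                                      \
       if (spin_trylock(&sched_debug_lock)) {                          \
               /* Winner gets the full PATH_MAX group_path[]. */       \
               task_group_path(tg, group_path, sizeof(group_path));    \
               SEQ_printf(m, fmt, group_path);                         \
               spin_unlock(&sched_debug_lock);                         \
       } else {                                                        \
               /* Loser uses a stack buffer; the path may be */        \
               /* truncated. */                                        \
               char buf[128];                                          \
               task_group_path(tg, buf, sizeof(buf));                  \
               SEQ_printf(m, fmt, buf);                                \
       }                                                               \
}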

Cheers,
Longman
