General comments:
- care to CC scheduler and mm people?
- couldn't this share some code with fs/proc?
- I am not sure/convinced it is worth the hassle

On 30. 04. 20, 8:43, Arseny Maslennikov wrote:
> If the three termios local flags isig, icanon, iexten are enabled
> and the local flag nokerninfo is disabled for a tty governed
> by the n_tty line discipline, then on receiving the keyboard status
> character n_tty will generate a status message and write it out to
> the tty before sending SIGINFO to the tty's foreground process group.
> 
> This kerninfo line contains information about the current system load
> as well as some properties of "the most interesting" process in the
> tty's current foreground process group, namely:
>  - its PID as seen inside its deepest PID namespace;
>    * the whole process group ought to be in a single PID namespace,
>      so this is actually deterministic
>  - its saved command name truncated to 16 bytes (task_struct::comm);
>    * at the time of writing TASK_COMM_LEN == 16
>  - its state and some related bits, procps-style;
>  - for S and D: its symbolic wait channel, if available; or a short
>    description for other process states instead;
>  - its user, system and real rusage time values;
>  - its resident set size (as well as the high watermark) in kilobytes.
> 
> The "most interesting" process is chosen as follows:
>  - runnables over everything
>  - uninterruptibles over everything else
>  - among 2 runnables pick the biggest utime + stime
>  - any unresolved ties are decided in favour of greatest PID.
> 
> While the kerninfo line is not very useful for debugging the kernel
> itself, since we have much more powerful debugging tools, it still gives
> the user behind the terminal some meaningful feedback to a VSTATUS that
> works even if no processes respond.

Care to append an example output to the commit message?

> Signed-off-by: Arseny Maslennikov <[email protected]>
...
> index f72a3fd4b..905cdd985 100644
> --- a/drivers/tty/n_tty.c
> +++ b/drivers/tty/n_tty.c
> @@ -79,6 +80,8 @@
>  #define ECHO_BLOCK           256
>  #define ECHO_DISCARD_WATERMARK       N_TTY_BUF_SIZE - (ECHO_BLOCK + 32)
>  
> +#define STATUS_LINE_LEN (2 * KSYM_NAME_LEN)

It's too magic constant.

> @@ -2489,6 +2496,21 @@ static int n_tty_ioctl(struct tty_struct *tty, struct 
> file *file,
>       }
>  }
>  
> +static void n_tty_status_line(struct tty_struct *tty)
> +{
> +     struct n_tty_data *ldata = tty->disc_data;
> +     char *msg, *buf;
> +     msg = buf = kzalloc(STATUS_LINE_LEN, GFP_KERNEL);
> +     tty_sprint_status_line(tty, buf + 1, STATUS_LINE_LEN - 1);
> +     /* The only current caller of this takes output_lock for us. */

So add a lockdep assertion?

> +     if (ldata->column != 0)
> +             *msg = '\n';
> +     else
> +             msg++;
> +     do_n_tty_write(tty, NULL, msg, strlen(msg));
> +     kfree(buf);
> +}
> +
>  static struct tty_ldisc_ops n_tty_ops = {
>       .magic           = TTY_LDISC_MAGIC,
>       .name            = "n_tty",
> diff --git a/drivers/tty/n_tty_status.c b/drivers/tty/n_tty_status.c
> new file mode 100644
> index 000000000..d92261bbe
> --- /dev/null
> +++ b/drivers/tty/n_tty_status.c
> @@ -0,0 +1,338 @@
> +// SPDX-License-Identifier: GPL-2.0+
> +
> +#include <linux/kallsyms.h>
> +#include <linux/pid.h>
> +#include <linux/sched.h>
> +#include <linux/sched/signal.h>
> +#include <linux/sched/loadavg.h>
> +#include <linux/sched/cputime.h>
> +#include <linux/sched/mm.h>
> +#include <linux/slab.h>
> +#include <linux/tty.h>
> +
> +#define BCOMPARE(lbool, rbool) ((lbool) << 1 | (rbool))
> +#define BCOMPARE_NONE 0
> +#define BCOMPARE_RIGHT 1
> +#define BCOMPARE_LEFT 2
> +#define BCOMPARE_BOTH (BCOMPARE_LEFT | BCOMPARE_RIGHT)
> +
> +/*
> + * Select the most interesting task of two.
> + *
> + * The implemented approach is simple for now:
> + * - pick runnable
> + * - if no runnables, pick uninterruptible
> + * - if tie between runnables, pick highest utime + stime
> + * - if a tie is not broken by the above, pick highest pid nr.
> + *
> + * For reference, here's the one used in FreeBSD:
> + * - pick runnables over anything
> + * - if both runnables, pick highest cpu utilization
> + * - if no runnables, pick shortest sleep time (the scheduler
> + *   actually takes care to maintain this statistic)
> + * - other ties are decided in favour of youngest process.
> + */
> +static struct task_struct *__better_proc_R(struct task_struct *a,

Why so weird name __better_proc_R?

> +             struct task_struct *b)
> +{
> +     unsigned long flags;
> +     u64 atime, btime, tgutime, tgstime;
> +     struct task_struct *ret;
> +
> +     if (!lock_task_sighand(a, &flags))
> +             goto out_a_unlocked;
> +     thread_group_cputime_adjusted(a, &tgutime, &tgstime);
> +     atime = tgutime + tgstime;
> +     unlock_task_sighand(a, &flags);
> +
> +     if (!lock_task_sighand(b, &flags))
> +             goto out_b_unlocked;
> +     thread_group_cputime_adjusted(b, &tgutime, &tgstime);
> +     btime = tgutime + tgstime;
> +     unlock_task_sighand(b, &flags);
> +
> +     ret = atime > btime ? a : b;
> +
> +     return ret;
> +
> +out_b_unlocked:
> +out_a_unlocked:
> +     return task_pid_vinr(a) > task_pid_vinr(b) ? a : b;
> +}
> +
> +static struct task_struct *__better_proc(struct task_struct *a,

Again, why "__" in the name?

> +             struct task_struct *b)
> +{
> +     if (!pid_alive(a))
> +             return b;
> +     if (!pid_alive(b))
> +             return a;
> +
> +     switch (BCOMPARE(a->state == TASK_RUNNING,
> +                     b->state == TASK_RUNNING)) {
> +     case BCOMPARE_LEFT:
> +             return a;
> +     case BCOMPARE_RIGHT:
> +             return b;
> +     case BCOMPARE_BOTH:
> +             return __better_proc_R(a, b);
> +     }

Doesn't this look saner and better:

if (a->state == TASK_RUNNING && b->state == TASK_RUNNING)
  return __better_proc_R(a, b);
if (a->state == TASK_RUNNING)
  return a;
if (b->state == TASK_RUNNING)
  return b;

?

And one doesn't need to decrypt the defines.

> +     switch (BCOMPARE(a->state == TASK_UNINTERRUPTIBLE,
> +                     b->state == TASK_UNINTERRUPTIBLE)) {
> +     case BCOMPARE_LEFT:
> +             return a;
> +     case BCOMPARE_RIGHT:
> +             return b;
> +     case BCOMPARE_BOTH:
> +             break;

dtto.

> +     }
> +
> +     /* TODO: Perhaps we should check something else... */
> +     return task_pid_vinr(a) > task_pid_vinr(b) ? a : b;
> +}
> +
> +/* Weed out NULLs. */
> +static inline struct task_struct *better_proc(struct task_struct *a,
> +             struct task_struct *b) {
> +     return a ? (b ? __better_proc(a, b) : a) : b;

This urgently calls for ifs and not ternany operators.

Actually you don't need this helper at all. Check a and b in the same if
as the respective !pid_alive above.

> +}
> +
> +static int scnprint_load(char *msgp, size_t size)
> +{
> +     unsigned long la[3];
> +
> +     get_avenrun(la, FIXED_1/200, 0);
> +     return scnprintf(msgp, size, "load: %lu.%02lu; ",
> +                     LOAD_INT(la[0]), LOAD_FRAC(la[0]));
> +}
> +
> +static int scnprint_task(char *msgp, size_t size, struct task_struct *task)
> +{
> +     char commname[TASK_COMM_LEN];
> +
> +     get_task_comm(commname, task);
> +     return scnprintf(msgp, size, "%d/%s:", task_pid_vinr(task), commname);
> +}
> +
> +static int scnprint_rusage(char *msgp, ssize_t size,
> +             struct task_struct *task, struct mm_struct *mm)
> +{
> +     struct rusage ru;
> +     struct timespec64 utime, stime;
> +     struct timespec64 rtime;
> +     u64 now;
> +     int ret = 0;
> +     int psz = 0;
> +
> +     getrusage(task, RUSAGE_BOTH, &ru);
> +     now = ktime_get_ns();
> +
> +     utime.tv_sec = ru.ru_utime.tv_sec;
> +     utime.tv_nsec = ru.ru_utime.tv_usec * NSEC_PER_USEC;
> +     stime.tv_sec = ru.ru_stime.tv_sec;
> +     stime.tv_nsec = ru.ru_stime.tv_usec * NSEC_PER_USEC;
> +
> +     rtime = ns_to_timespec64(now - task->start_time);
> +
> +     psz = scnprintf(msgp, size,
> +                     "%llu.%03lur %llu.%03luu %llu.%03lus",
> +                     rtime.tv_sec, rtime.tv_nsec / NSEC_PER_MSEC,
> +                     utime.tv_sec, utime.tv_nsec / NSEC_PER_MSEC,
> +                     stime.tv_sec, stime.tv_nsec / NSEC_PER_MSEC);
> +     ret += psz;
> +     msgp += psz;
> +     size -= psz;
> +
> +     if (mm) {
> +             psz = scnprintf(msgp, size,
> +                             " %luk/%luk",
> +                             get_mm_rss(mm) * PAGE_SIZE / 1024,
> +                             get_mm_hiwater_rss(mm) * PAGE_SIZE / 1024);
> +             ret += psz;
> +     }
> +     return ret;
> +}
> +
> +static int scnprint_state(char *msgp, ssize_t size,
> +             struct task_struct *task, struct mm_struct *mm)
> +{
> +     char stat[8] = {0};
> +     const char *state_descr = "";
> +     struct task_struct *parent = NULL;
> +     struct task_struct *real_parent = NULL;
> +     unsigned long wchan = 0;
> +     int psz = 0;
> +     char symname[KSYM_NAME_LEN];
> +
> +     stat[psz++] = task_state_to_char(task);
> +     if (task_nice(task) < 0)
> +             stat[psz++] = '<';
> +     else if (task_nice(task) > 0)
> +             stat[psz++] = 'N';
> +     if (mm && mm->locked_vm)
> +             stat[psz++] = 'L';
> +     if (get_nr_threads(task) > 1)
> +             stat[psz++] = 'l';
> +
> +     switch (stat[0]) {
> +     case 'R':
> +             if (task_curr(task))
> +                     stat[psz++] = '!';
> +             break;
> +     case 'S':
> +     case 'D':
> +             wchan = get_wchan(task);
> +             if (!wchan)
> +                     break;
> +             if (!lookup_symbol_name(wchan, symname))
> +                     state_descr = symname;
> +             else
> +                     /* get_wchan() returned something
> +                      * that looks like no kernel symbol.
> +                      */
> +                     state_descr = "*unknown*";
> +             break;
> +     case 'T':
> +             state_descr = "stopped";
> +             break;
> +     case 't':
> +             state_descr = "traced";
> +             break;
> +     case 'Z':
> +             rcu_read_lock();
> +             real_parent = rcu_dereference(task->real_parent);
> +             parent = rcu_dereference(task->parent);
> +             psz = sprintf(symname, "zombie; ppid=%d",
> +                     task_tgid_nr_ns(real_parent,
> +                             ns_of_pid(task_pid(task))));
> +             if (parent != real_parent)
> +                     sprintf(symname + psz, " reaper=%d",
> +                             task_tgid_nr_ns(parent,
> +                                     ns_of_pid(task_pid(task))));
> +             rcu_read_unlock();
> +             state_descr = symname;
> +             break;
> +     case 'I':
> +             /* Can this even happen? */
> +             state_descr = "idle";
> +             break;
> +     default:
> +             state_descr = "unknown";
> +     }
> +
> +     psz = scnprintf(msgp, size, "%s", stat);
> +     msgp += psz;
> +     size -= psz;
> +     if (*state_descr)
> +             psz += scnprintf(msgp, size, wchan ? " [%s]" : " (%s)", 
> state_descr);
> +
> +     return psz;
> +}
> +
> +/**
> + *   tty_sprint_status_line  -               produce kerninfo line
> + *   @tty: terminal device
> + *   @msg: preallocated memory buffer
> + *   @length: maximum line length
> + *
> + *   Reports state of foreground process group in a null-terminated string
> + *   located at @msg, @length bytes long. If @length is insufficient,
> + *   the line gets truncated.
> + */
> +void tty_sprint_status_line(struct tty_struct *t, char *msg, size_t length)
> +{
> +     struct task_struct *tsk = NULL, *mit = NULL;
> +     struct mm_struct *mm;
> +     struct pid *pgrp = NULL;
> +     char *msgp = msg;
> +     int psz = 0;
> +
> +     if (!length)
> +             return;
> +     length--; /* Make room for trailing '\n' */
> +
> +     psz = scnprint_load(msgp, length);
> +     if (psz > 0) {
> +             msgp += psz;
> +             length -= psz;
> +     }
> +     if (!length)
> +             goto finalize_message;
> +
> +     /* Not sure if session pid is protected by ctrl_lock
> +      * or tasklist_lock...
> +      */
> +     pgrp = t->session;
> +     if (pgrp == NULL) {
> +             psz = scnprintf(msgp, length, "not a controlling tty");
> +             if (psz > 0)
> +                     msgp += psz;
> +             goto finalize_message;
> +     }
> +     pgrp = tty_get_pgrp(t);
> +     if (pgrp == NULL) {
> +             psz = scnprintf(msgp, length, "no foreground process group");
> +             if (psz > 0)
> +                     msgp += psz;
> +             goto finalize_message;
> +     }
> +
> +     read_lock(&tasklist_lock);
> +     do_each_pid_task(pgrp, PIDTYPE_PGID, tsk)
> +     {
> +             /* Select the most interesting task. */
> +             if (tsk == better_proc(mit, tsk))
> +                     mit = tsk;
> +     } while_each_pid_task(pgrp, PIDTYPE_PGID, tsk);
> +     read_unlock(&tasklist_lock);
> +
> +     if (!pid_alive(mit))
> +             goto finalize_message;
> +
> +     /* Gather intel on most interesting task. */
> +     /* Can the mm of a foreground process turn out to be NULL?
> +      * Definitely; for example, if it is a zombie.
> +      */
> +     mm = get_task_mm(mit);
> +
> +     psz = scnprint_task(msgp, length, mit);
> +     if (psz > 0) {
> +             msgp += psz;
> +             length -= psz;
> +     }
> +     if (!length)
> +             goto finalize_message;
> +     *msgp++ = ' ';
> +     length--;
> +
> +     psz = scnprint_state(msgp, length, mit, mm);
> +     if (psz > 0) {
> +             msgp += psz;
> +             length -= psz;
> +     }
> +     if (!length)
> +             goto finalize_message;
> +     *msgp++ = ' ';
> +     length--;
> +
> +     psz = scnprint_rusage(msgp, length, mit, mm);
> +     if (psz > 0) {
> +             msgp += psz;
> +             length -= psz;
> +     }
> +     if (!length)
> +             goto finalize_message;
> +     *msgp++ = ' ';
> +     length--;
> +
> +     if (!mm)
> +             goto finalize_message;
> +
> +     mmput(mm);
> +
> +finalize_message:
> +     *msgp++ = '\n';
> +     if (pgrp)
> +             put_pid(pgrp);
> +}
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 4418f5cb8..195abd47d 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1318,6 +1318,8 @@ static inline struct pid *task_pid(struct task_struct 
> *task)
>   * task_xid_nr()     : global id, i.e. the id seen from the init namespace;
>   * task_xid_vnr()    : virtual id, i.e. the id seen from the pid namespace of
>   *                     current.
> + * task_xid_vinr()   : virtual inner id, i.e. the id seen from the namespace 
> of
> + *                     the task itself;
>   * task_xid_nr_ns()  : id seen from the ns specified;
>   *
>   * see also pid_nr() etc in include/linux/pid.h
> @@ -1339,6 +1341,11 @@ static inline pid_t task_pid_vnr(struct task_struct 
> *tsk)
>       return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
>  }
>  
> +static inline pid_t task_pid_vinr(struct task_struct *tsk)
> +{
> +     return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns_of_pid(task_pid(tsk)));
> +}
> +

This smells like it should be done in a separate patch.

>  static inline pid_t task_tgid_nr(struct task_struct *tsk)
>  {
> diff --git a/include/linux/tty.h b/include/linux/tty.h
> index 3499845ab..2023addaf 100644
> --- a/include/linux/tty.h
> +++ b/include/linux/tty.h
> @@ -727,6 +727,9 @@ extern void __init n_tty_init(void);
>  static inline void n_tty_init(void) { }
>  #endif
>  
> +/* n_tty_status.c */
> +extern void tty_sprint_status_line(struct tty_struct *tty, char *msg, size_t 
> size);

No need for extern.

thanks,
-- 
js
suse labs

Reply via email to