task_diag is based on netlink sockets and looks like socket-diag, which
is used to get information about sockets.

task_diag is a new interface which is going to replace the proc file
system in cases when we need to get information in a binary format.

A request message is described by the task_diag_pid structure:
struct task_diag_pid {
       __u64   show_flags;
       __u64   dump_strategy;

       __u32   pid;
};

A response is a set of netlink messages. Each message describes one task.
All task properties are divided into groups. A message contains the
TASK_DIAG_PID group, and other groups if they have been requested in
show_flags. For example, if show_flags contains TASK_DIAG_SHOW_BASE, a
response will contain the TASK_DIAG_BASE group which is described by the
task_diag_base structure.

struct task_diag_base {
        __u32   tgid;
        __u32   pid;
        __u32   ppid;
        __u32   tpid;
        __u32   sid;
        __u32   pgid;
        __u8    state;
        char    comm[TASK_DIAG_COMM_LEN];
};

The dump_strategy field will be used in following patches to request
information for a group of processes.

v2: A few changes from David Ahern
    Use a consistent name
    Add max attr enum
    task diag: Send pid as u32
    Change _MSG/msg references to base
    Fix 8-byte alignment

Cc: David Ahern <[email protected]>
Signed-off-by: Andrey Vagin <[email protected]>
---
 include/linux/taskstats_kern.h |   7 ++
 include/uapi/linux/task_diag.h |  60 +++++++++++++++
 include/uapi/linux/taskstats.h |   2 +
 init/Kconfig                   |  12 +++
 kernel/Makefile                |   1 +
 kernel/taskdiag.c              | 168 +++++++++++++++++++++++++++++++++++++++++
 kernel/taskstats.c             |  25 +++++-
 7 files changed, 271 insertions(+), 4 deletions(-)
 create mode 100644 include/uapi/linux/task_diag.h
 create mode 100644 kernel/taskdiag.c

diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h
index 58de6ed..a1fd4f8 100644
--- a/include/linux/taskstats_kern.h
+++ b/include/linux/taskstats_kern.h
@@ -15,6 +15,8 @@
 extern struct kmem_cache *taskstats_cache;
 extern struct mutex taskstats_exit_mutex;
 
+extern struct genl_family taskstats_family;
+
 static inline void taskstats_tgid_free(struct signal_struct *sig)
 {
        if (sig->stats)
@@ -23,6 +25,11 @@ static inline void taskstats_tgid_free(struct signal_struct *sig)
 
 extern void taskstats_exit(struct task_struct *, int group_dead);
 extern void taskstats_init_early(void);
+
+struct genl_info;
+struct sk_buff;
+int taskdiag_doit(struct sk_buff *skb, struct genl_info *info);
+
 #else
 static inline void taskstats_exit(struct task_struct *tsk, int group_dead)
 {}
diff --git a/include/uapi/linux/task_diag.h b/include/uapi/linux/task_diag.h
new file mode 100644
index 0000000..3a1e6c4
--- /dev/null
+++ b/include/uapi/linux/task_diag.h
@@ -0,0 +1,60 @@
+#ifndef _LINUX_TASK_DIAG_H
+#define _LINUX_TASK_DIAG_H
+
+#include <linux/types.h>
+#include <linux/capability.h>
+
+enum {
+       /* optional attributes which can be specified in show_flags */
+       TASK_DIAG_BASE  = 0,
+
+       /* other attributes */
+       TASK_DIAG_PID   = 64,   /* u32 */
+
+       __TASK_DIAG_ATTR_MAX
+#define TASK_DIAG_ATTR_MAX (__TASK_DIAG_ATTR_MAX - 1)
+};
+
+#define TASK_DIAG_SHOW_BASE    (1ULL << TASK_DIAG_BASE)
+
+enum {
+       TASK_DIAG_RUNNING,
+       TASK_DIAG_INTERRUPTIBLE,
+       TASK_DIAG_UNINTERRUPTIBLE,
+       TASK_DIAG_STOPPED,
+       TASK_DIAG_TRACE_STOP,
+       TASK_DIAG_DEAD,
+       TASK_DIAG_ZOMBIE,
+};
+
+#define TASK_DIAG_COMM_LEN 16
+
+struct task_diag_base {
+       __u32   tgid;
+       __u32   pid;
+       __u32   ppid;
+       __u32   tpid;
+       __u32   sid;
+       __u32   pgid;
+       __u8    state;
+       char    comm[TASK_DIAG_COMM_LEN];
+};
+
+#define TASK_DIAG_DUMP_ALL     0
+
+struct task_diag_pid {
+       __u64   show_flags;
+       __u64   dump_strategy;
+
+       __u32   pid;
+};
+
+enum {
+       TASK_DIAG_CMD_ATTR_UNSPEC = 0,
+       TASK_DIAG_CMD_ATTR_GET,
+       __TASK_DIAG_CMD_ATTR_MAX,
+};
+
+#define TASK_DIAG_CMD_ATTR_MAX (__TASK_DIAG_CMD_ATTR_MAX - 1)
+
+#endif /* _LINUX_TASK_DIAG_H */
diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h
index a1cc91b..04b974a 100644
--- a/include/uapi/linux/taskstats.h
+++ b/include/uapi/linux/taskstats.h
@@ -181,6 +181,8 @@ enum {
        CGROUPSTATS_CMD_GET,            /* user->kernel request/get-response */
        CGROUPSTATS_CMD_NEW,            /* kernel->user event */
 
+       TASK_DIAG_CMD_GET,
+
        __TASKSTATS_CMD_MAX,
 };
 
diff --git a/init/Kconfig b/init/Kconfig
index 7d1ffd2..4d0483c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -432,6 +432,18 @@ config TASKSTATS
 
          Say N if unsure.
 
+config TASK_DIAG
+       bool "Export task/process properties through netlink"
+       depends on NET && TASKSTATS
+       default n
+       help
+         Export selected properties for tasks/processes through the
+         generic netlink interface. Unlike the proc file system, task_diag
+         returns information in a binary format and lets the caller
+         specify which information is required.
+
+         Say N if unsure.
+
 config TASK_DELAY_ACCT
        bool "Enable per-task delay accounting"
        depends on TASKSTATS
diff --git a/kernel/Makefile b/kernel/Makefile
index 60c302c..ed6fed5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -98,6 +98,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_JUMP_LABEL) += jump_label.o
 obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
 obj-$(CONFIG_TORTURE_TEST) += torture.o
+obj-$(CONFIG_TASK_DIAG) += taskdiag.o
 
 $(obj)/configs.o: $(obj)/config_data.h
 
diff --git a/kernel/taskdiag.c b/kernel/taskdiag.c
new file mode 100644
index 0000000..7327e08
--- /dev/null
+++ b/kernel/taskdiag.c
@@ -0,0 +1,168 @@
+#include <linux/kernel.h>
+#include <linux/taskstats_kern.h>
+#include <linux/task_diag.h>
+#include <net/genetlink.h>
+#include <linux/pid_namespace.h>
+#include <linux/ptrace.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+
+static size_t taskdiag_packet_size(u64 show_flags)
+{
+       /* every reply carries the u32 TASK_DIAG_PID attribute */
+       size_t len = nla_total_size(sizeof(u32));
+
+       /* room for the base group is added only when it was requested */
+       if (show_flags & TASK_DIAG_SHOW_BASE)
+               len += nla_total_size(sizeof(struct task_diag_base));
+
+       return len;
+}
+
+/*
+ * Maps fls() of the task's TASK_REPORT state bits (see get_task_state())
+ * to the TASK_DIAG_* value reported to user space. Slot 0 is used when
+ * no bit is set, i.e. the task is running; the remaining slots are in
+ * ascending order of the state bit they stand for.
+ */
+static const __u8 task_state_array[] = {
+       TASK_DIAG_RUNNING,
+       TASK_DIAG_INTERRUPTIBLE,
+       TASK_DIAG_UNINTERRUPTIBLE,
+       TASK_DIAG_STOPPED,
+       TASK_DIAG_TRACE_STOP,
+       TASK_DIAG_DEAD,
+       TASK_DIAG_ZOMBIE,
+};
+
+static inline __u8 get_task_state(struct task_struct *tsk)
+{
+       unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT;
+
+       /* fls(state) spans 0..1 + ilog2(TASK_REPORT); one table slot per value */
+       BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1);
+       return task_state_array[fls(state)];
+}
+
+/* Add a TASK_DIAG_BASE attribute for @p to @skb; -EMSGSIZE if it won't fit. */
+static int fill_task_base(struct task_struct *p, struct sk_buff *skb)
+{
+       struct pid_namespace *ns = task_active_pid_ns(current);
+       struct task_diag_base *base;
+       struct nlattr *attr;
+       char tcomm[sizeof(p->comm)];
+       struct task_struct *tracer;
+
+       attr = nla_reserve(skb, TASK_DIAG_BASE, sizeof(struct task_diag_base));
+       if (!attr)
+               return -EMSGSIZE;
+
+       /* the reserved payload is not cleared: zero it, struct padding included */
+       base = nla_data(attr);
+       memset(base, 0, sizeof(*base));
+
+       rcu_read_lock();
+       base->ppid = pid_alive(p) ?
+               task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
+
+       tracer = ptrace_parent(p);      /* tpid stays 0 when there is no tracer */
+       if (tracer)
+               base->tpid = task_pid_nr_ns(tracer, ns);
+
+       base->tgid = task_tgid_nr_ns(p, ns);
+       base->pid = task_pid_nr_ns(p, ns);
+       base->sid = task_session_nr_ns(p, ns);
+       base->pgid = task_pgrp_nr_ns(p, ns);
+       rcu_read_unlock();
+
+       get_task_comm(tcomm, p);
+       strncpy(base->comm, tcomm, TASK_DIAG_COMM_LEN);
+
+       base->state = get_task_state(p);
+
+       return 0;
+}
+
+/* Build one TASK_DIAG_CMD_GET message for @tsk in @skb; 0 or negative errno. */
+static int task_diag_fill(struct task_struct *tsk, struct sk_buff *skb,
+                               u64 show_flags, u32 portid, u32 seq)
+{
+       void *reply;
+       int err;
+
+       reply = genlmsg_put(skb, portid, seq, &taskstats_family, 0, TASK_DIAG_CMD_GET);
+       if (reply == NULL)
+               return -EMSGSIZE;
+
+       /* TASK_DIAG_PID is always emitted, whatever show_flags asks for */
+       err = nla_put_u32(skb, TASK_DIAG_PID, task_pid_vnr(tsk));
+       if (err)
+               goto err;
+
+       if (show_flags & TASK_DIAG_SHOW_BASE) {
+               err = fill_task_base(tsk, skb);
+               if (err)
+                       goto err;
+       }
+
+       genlmsg_end(skb, reply);
+       return 0;
+err:
+       genlmsg_cancel(skb, reply);
+       return err;
+}
+
+/* genetlink .doit handler for TASK_DIAG_CMD_GET: reply with one task's data. */
+int taskdiag_doit(struct sk_buff *skb, struct genl_info *info)
+{
+       struct nlattr *nla = info->attrs[TASK_DIAG_CMD_ATTR_GET];
+       struct task_struct *tsk = NULL;
+       struct task_diag_pid req;
+       struct sk_buff *msg;
+       size_t size;
+       int rc;
+
+       /* nla_data() is never NULL, even for a missing attribute: test nla */
+       if (!nla || nla_len(nla) < sizeof(req))
+               return -EINVAL;
+
+       /*
+        * use a req variable to deal with alignment issues. task_diag_pid
+        * contains u64 elements which means extended load operations can be
+        * used and those can require 8-byte alignment (e.g., sparc)
+        */
+       memcpy(&req, nla_data(nla), sizeof(req));
+
+       size = taskdiag_packet_size(req.show_flags);
+       msg = genlmsg_new(size, GFP_KERNEL);
+       if (!msg)
+               return -ENOMEM;
+
+       /* take a reference so the task can be used after the RCU section */
+       rcu_read_lock();
+       tsk = find_task_by_vpid(req.pid);
+       if (tsk)
+               get_task_struct(tsk);
+       rcu_read_unlock();
+       if (!tsk) {
+               rc = -ESRCH;
+               goto err;
+       }
+
+       if (!ptrace_may_access(tsk, PTRACE_MODE_READ)) {
+               put_task_struct(tsk);
+               rc = -EPERM;
+               goto err;
+       }
+
+       rc = task_diag_fill(tsk, msg, req.show_flags,
+                               info->snd_portid, info->snd_seq);
+       put_task_struct(tsk);
+       if (rc < 0)
+               goto err;
+
+       return genlmsg_reply(msg, info);
+err:
+       nlmsg_free(msg);
+       return rc;
+}
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 21f82c2..d70f1e5 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -18,6 +18,7 @@
 
 #include <linux/kernel.h>
 #include <linux/taskstats_kern.h>
+#include <linux/task_diag.h>
 #include <linux/tsacct_kern.h>
 #include <linux/delayacct.h>
 #include <linux/cpumask.h>
@@ -41,7 +42,7 @@ static DEFINE_PER_CPU(__u32, taskstats_seqnum);
 static int family_registered;
 struct kmem_cache *taskstats_cache;
 
-static struct genl_family family = {
+struct genl_family taskstats_family = {
        .id             = GENL_ID_GENERATE,
        .name           = TASKSTATS_GENL_NAME,
        .version        = TASKSTATS_GENL_VERSION,
@@ -92,9 +93,9 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
        if (!info) {
                int seq = this_cpu_inc_return(taskstats_seqnum) - 1;
 
-               reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
+               reply = genlmsg_put(skb, 0, seq, &taskstats_family, 0, cmd);
        } else
-               reply = genlmsg_put_reply(skb, info, &family, 0, cmd);
+               reply = genlmsg_put_reply(skb, info, &taskstats_family, 0, cmd);
        if (reply == NULL) {
                nlmsg_free(skb);
                return -EINVAL;
@@ -664,6 +665,15 @@ err:
        nlmsg_free(rep_skb);
 }
 
+#ifdef CONFIG_TASK_DIAG
+static const struct nla_policy
+                       taskdiag_cmd_get_policy[TASK_DIAG_CMD_ATTR_MAX+1] = {
+       [TASK_DIAG_CMD_ATTR_GET]  = {   .type = NLA_UNSPEC,
+                                       .len = sizeof(struct task_diag_pid)
+                               },
+};
+#endif
+
 static const struct genl_ops taskstats_ops[] = {
        {
                .cmd            = TASKSTATS_CMD_GET,
@@ -676,6 +686,13 @@ static const struct genl_ops taskstats_ops[] = {
                .doit           = cgroupstats_user_cmd,
                .policy         = cgroupstats_cmd_get_policy,
        },
+#ifdef CONFIG_TASK_DIAG
+       {
+               .cmd            = TASK_DIAG_CMD_GET,
+               .doit           = taskdiag_doit,
+               .policy         = taskdiag_cmd_get_policy,
+       },
+#endif
 };
 
 /* Needed early in initialization */
@@ -694,7 +711,7 @@ static int __init taskstats_init(void)
 {
        int rc;
 
-       rc = genl_register_family_with_ops(&family, taskstats_ops);
+       rc = genl_register_family_with_ops(&taskstats_family, taskstats_ops);
        if (rc)
                return rc;
 
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to